- 
          
- 
                Notifications
    You must be signed in to change notification settings 
- Fork 1.2k
Compatibility with Zarr v3b2 #9795
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
          
     Merged
      
      
    
  
     Merged
                    Changes from all commits
      Commits
    
    
            Show all changes
          
          
            9 commits
          
        
        Select commit
          Hold shift + click to select a range
      
      e18e650
              
                Compatibility with Zarr v3b2
              
              
                dcherian c0af9ee
              
                More guards with mode="w"
              
              
                dcherian a990970
              
                refactoring
              
              
                dcherian c07ed88
              
                tweak expected requestsC
              
              
                dcherian aa522e7
              
                Merge branch 'main' into zarr-v3-b2
              
              
                dcherian 4563efe
              
                compat
              
              
                dcherian ca30cd8
              
                more compat
              
              
                dcherian f2e91e9
              
                fix
              
              
                dcherian 12f577a
              
                Merge branch 'main' into zarr-v3-b2
              
              
                dcherian File filter
Filter by extension
Conversations
          Failed to load comments.   
        
        
          
      Loading
        
  Jump to
        
          Jump to file
        
      
      
          Failed to load files.   
        
        
          
      Loading
        
  Diff view
Diff view
There are no files selected for viewing
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
              | Original file line number | Diff line number | Diff line change | 
|---|---|---|
|  | @@ -37,6 +37,7 @@ | |
| from xarray.namedarray.utils import module_available | ||
|  | ||
| if TYPE_CHECKING: | ||
| from zarr import Array as ZarrArray | ||
| from zarr import Group as ZarrGroup | ||
|  | ||
| from xarray.backends.common import AbstractDataStore | ||
|  | @@ -443,7 +444,7 @@ def extract_zarr_variable_encoding( | |
| shape = shape if shape else variable.shape | ||
| encoding = variable.encoding.copy() | ||
|  | ||
| safe_to_drop = {"source", "original_shape"} | ||
| safe_to_drop = {"source", "original_shape", "preferred_chunks"} | ||
| valid_encodings = { | ||
| "codecs", | ||
| "chunks", | ||
|  | @@ -871,16 +872,27 @@ def store( | |
| else: | ||
| zarr = attempt_import("zarr") | ||
|  | ||
| existing_keys = tuple(self.zarr_group.array_keys()) | ||
| if self._mode == "w": | ||
| # always overwrite, so we don't care about existing names, | ||
| # and consistency of encoding | ||
| new_variable_names = set(variables) | ||
| existing_keys = {} | ||
| existing_variable_names = {} | ||
| else: | ||
| existing_keys = tuple(self.zarr_group.array_keys()) | ||
| existing_variable_names = { | ||
| vn for vn in variables if _encode_variable_name(vn) in existing_keys | ||
| } | ||
| new_variable_names = set(variables) - existing_variable_names | ||
|  | ||
| if self._mode == "r+": | ||
| new_names = [k for k in variables if k not in existing_keys] | ||
| if new_names: | ||
| raise ValueError( | ||
| f"dataset contains non-pre-existing variables {new_names}, " | ||
| "which is not allowed in ``xarray.Dataset.to_zarr()`` with " | ||
| "``mode='r+'``. To allow writing new variables, set ``mode='a'``." | ||
| ) | ||
| if self._mode == "r+" and ( | ||
| new_names := [k for k in variables if k not in existing_keys] | ||
| ): | ||
| raise ValueError( | ||
| f"dataset contains non-pre-existing variables {new_names!r}, " | ||
| "which is not allowed in ``xarray.Dataset.to_zarr()`` with " | ||
| "``mode='r+'``. To allow writing new variables, set ``mode='a'``." | ||
| ) | ||
|  | ||
| if self._append_dim is not None and self._append_dim not in existing_keys: | ||
| # For dimensions without coordinate values, we must parse | ||
|  | @@ -895,10 +907,6 @@ def store( | |
| f"dataset dimensions {existing_dims}" | ||
| ) | ||
|  | ||
| existing_variable_names = { | ||
| vn for vn in variables if _encode_variable_name(vn) in existing_keys | ||
| } | ||
| new_variable_names = set(variables) - existing_variable_names | ||
| variables_encoded, attributes = self.encode( | ||
| {vn: variables[vn] for vn in new_variable_names}, attributes | ||
| ) | ||
|  | @@ -920,10 +928,9 @@ def store( | |
| # Modified variables must use the same encoding as the store. | ||
| vars_with_encoding = {} | ||
| for vn in existing_variable_names: | ||
| if self._mode in ["a", "a-", "r+"]: | ||
| _validate_datatypes_for_zarr_append( | ||
| vn, existing_vars[vn], variables[vn] | ||
| ) | ||
| _validate_datatypes_for_zarr_append( | ||
| vn, existing_vars[vn], variables[vn] | ||
| ) | ||
| vars_with_encoding[vn] = variables[vn].copy(deep=False) | ||
| vars_with_encoding[vn].encoding = existing_vars[vn].encoding | ||
| vars_with_encoding, _ = self.encode(vars_with_encoding, {}) | ||
|  | @@ -968,6 +975,69 @@ def store( | |
| def sync(self): | ||
| pass | ||
|  | ||
| def _open_existing_array(self, *, name) -> ZarrArray: | ||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just moved it around so  | ||
| import zarr | ||
|  | ||
| # TODO: if mode="a", consider overriding the existing variable | ||
| # metadata. This would need some case work properly with region | ||
| # and append_dim. | ||
| if self._write_empty is not None: | ||
| # Write to zarr_group.chunk_store instead of zarr_group.store | ||
| # See https://github.com/pydata/xarray/pull/8326#discussion_r1365311316 for a longer explanation | ||
| # The open_consolidated() enforces a mode of r or r+ | ||
| # (and to_zarr with region provided enforces a read mode of r+), | ||
| # and this function makes sure the resulting Group has a store of type ConsolidatedMetadataStore | ||
| # and a 'normal Store subtype for chunk_store. | ||
| # The exact type depends on if a local path was used, or a URL of some sort, | ||
| # but the point is that it's not a read-only ConsolidatedMetadataStore. | ||
| # It is safe to write chunk data to the chunk_store because no metadata would be changed by | ||
| # to_zarr with the region parameter: | ||
| # - Because the write mode is enforced to be r+, no new variables can be added to the store | ||
| # (this is also checked and enforced in xarray.backends.api.py::to_zarr()). | ||
| # - Existing variables already have their attrs included in the consolidated metadata file. | ||
| # - The size of dimensions can not be expanded, that would require a call using `append_dim` | ||
| # which is mutually exclusive with `region` | ||
| zarr_array = zarr.open( | ||
| store=( | ||
| self.zarr_group.store if _zarr_v3() else self.zarr_group.chunk_store | ||
| ), | ||
| # TODO: see if zarr should normalize these strings. | ||
| path="/".join([self.zarr_group.name.rstrip("/"), name]).lstrip("/"), | ||
| write_empty_chunks=self._write_empty, | ||
| ) | ||
| else: | ||
| zarr_array = self.zarr_group[name] | ||
|  | ||
| return zarr_array | ||
|  | ||
| def _create_new_array( | ||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just moved it around so  | ||
| self, *, name, shape, dtype, fill_value, encoding, attrs | ||
| ) -> ZarrArray: | ||
| if coding.strings.check_vlen_dtype(dtype) is str: | ||
| dtype = str | ||
|  | ||
| if self._write_empty is not None: | ||
| if ( | ||
| "write_empty_chunks" in encoding | ||
| and encoding["write_empty_chunks"] != self._write_empty | ||
| ): | ||
| raise ValueError( | ||
| 'Differing "write_empty_chunks" values in encoding and parameters' | ||
| f'Got {encoding["write_empty_chunks"] = } and {self._write_empty = }' | ||
| ) | ||
| else: | ||
| encoding["write_empty_chunks"] = self._write_empty | ||
|  | ||
| zarr_array = self.zarr_group.create( | ||
| name, | ||
| shape=shape, | ||
| dtype=dtype, | ||
| fill_value=fill_value, | ||
| **encoding, | ||
| ) | ||
| zarr_array = _put_attrs(zarr_array, attrs) | ||
| return zarr_array | ||
|  | ||
| def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=None): | ||
| """ | ||
| This provides a centralized method to set the variables on the data | ||
|  | @@ -986,8 +1056,6 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No | |
| dimensions. | ||
| """ | ||
|  | ||
| import zarr | ||
|  | ||
| existing_keys = tuple(self.zarr_group.array_keys()) | ||
| is_zarr_v3_format = _zarr_v3() and self.zarr_group.metadata.zarr_format == 3 | ||
|  | ||
|  | @@ -1016,47 +1084,13 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No | |
| else: | ||
| del v.encoding["_FillValue"] | ||
|  | ||
| zarr_array = None | ||
| zarr_shape = None | ||
| write_region = self._write_region if self._write_region is not None else {} | ||
| write_region = {dim: write_region.get(dim, slice(None)) for dim in dims} | ||
|  | ||
| if name in existing_keys: | ||
| if self._mode != "w" and name in existing_keys: | ||
| # existing variable | ||
| # TODO: if mode="a", consider overriding the existing variable | ||
| # metadata. This would need some case work properly with region | ||
| # and append_dim. | ||
| if self._write_empty is not None: | ||
| # Write to zarr_group.chunk_store instead of zarr_group.store | ||
| # See https://github.com/pydata/xarray/pull/8326#discussion_r1365311316 for a longer explanation | ||
| # The open_consolidated() enforces a mode of r or r+ | ||
| # (and to_zarr with region provided enforces a read mode of r+), | ||
| # and this function makes sure the resulting Group has a store of type ConsolidatedMetadataStore | ||
| # and a 'normal Store subtype for chunk_store. | ||
| # The exact type depends on if a local path was used, or a URL of some sort, | ||
| # but the point is that it's not a read-only ConsolidatedMetadataStore. | ||
| # It is safe to write chunk data to the chunk_store because no metadata would be changed by | ||
| # to_zarr with the region parameter: | ||
| # - Because the write mode is enforced to be r+, no new variables can be added to the store | ||
| # (this is also checked and enforced in xarray.backends.api.py::to_zarr()). | ||
| # - Existing variables already have their attrs included in the consolidated metadata file. | ||
| # - The size of dimensions can not be expanded, that would require a call using `append_dim` | ||
| # which is mutually exclusive with `region` | ||
| zarr_array = zarr.open( | ||
| store=( | ||
| self.zarr_group.store | ||
| if _zarr_v3() | ||
| else self.zarr_group.chunk_store | ||
| ), | ||
| # TODO: see if zarr should normalize these strings. | ||
| path="/".join([self.zarr_group.name.rstrip("/"), name]).lstrip( | ||
| "/" | ||
| ), | ||
| write_empty_chunks=self._write_empty, | ||
| ) | ||
| else: | ||
| zarr_array = self.zarr_group[name] | ||
|  | ||
| zarr_array = self._open_existing_array(name=name) | ||
| if self._append_dim is not None and self._append_dim in dims: | ||
| # resize existing variable | ||
| append_axis = dims.index(self._append_dim) | ||
|  | @@ -1089,40 +1123,27 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No | |
| shape=zarr_shape, | ||
| ) | ||
|  | ||
| if name not in existing_keys: | ||
| if self._mode == "w" or name not in existing_keys: | ||
| # new variable | ||
| encoded_attrs = {} | ||
| encoded_attrs = {k: self.encode_attribute(v) for k, v in attrs.items()} | ||
| # the magic for storing the hidden dimension data | ||
| if is_zarr_v3_format: | ||
| encoding["dimension_names"] = dims | ||
| else: | ||
| encoded_attrs[DIMENSION_KEY] = dims | ||
| for k2, v2 in attrs.items(): | ||
| encoded_attrs[k2] = self.encode_attribute(v2) | ||
|  | ||
| if coding.strings.check_vlen_dtype(dtype) is str: | ||
| dtype = str | ||
|  | ||
| if self._write_empty is not None: | ||
| if ( | ||
| "write_empty_chunks" in encoding | ||
| and encoding["write_empty_chunks"] != self._write_empty | ||
| ): | ||
| raise ValueError( | ||
| 'Differing "write_empty_chunks" values in encoding and parameters' | ||
| f'Got {encoding["write_empty_chunks"] = } and {self._write_empty = }' | ||
| ) | ||
| else: | ||
| encoding["write_empty_chunks"] = self._write_empty | ||
|  | ||
| zarr_array = self.zarr_group.create( | ||
| name, | ||
| shape=shape, | ||
|  | ||
| encoding["exists_ok" if _zarr_v3() else "overwrite"] = ( | ||
| True if self._mode == "w" else False | ||
| ) | ||
|  | ||
| zarr_array = self._create_new_array( | ||
| name=name, | ||
| dtype=dtype, | ||
| shape=shape, | ||
| fill_value=fill_value, | ||
| **encoding, | ||
| encoding=encoding, | ||
| attrs=encoded_attrs, | ||
| ) | ||
| zarr_array = _put_attrs(zarr_array, encoded_attrs) | ||
|  | ||
| writer.add(v.data, zarr_array, region) | ||
|  | ||
|  | ||
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
              
      
      Oops, something went wrong.
        
    
  
      
      Oops, something went wrong.
        
    
  
  Add this suggestion to a batch that can be applied as a single commit.
  This suggestion is invalid because no changes were made to the code.
  Suggestions cannot be applied while the pull request is closed.
  Suggestions cannot be applied while viewing a subset of changes.
  Only one suggestion per line can be applied in a batch.
  Add this suggestion to a batch that can be applied as a single commit.
  Applying suggestions on deleted lines is not supported.
  You must change the existing code in this line in order to create a valid suggestion.
  Outdated suggestions cannot be applied.
  This suggestion has been applied or marked resolved.
  Suggestions cannot be applied from pending reviews.
  Suggestions cannot be applied on multi-line comments.
  Suggestions cannot be applied while the pull request is queued to merge.
  Suggestion cannot be applied right now. Please check back later.
  
    
  
    
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
logic change. We now skip all these checks for mode="w". i think earlier, zarr would erase the group so all the logic below was not activated. Now that zarr does not erase the group, we must skip all the logic below.