3.3 xarray
Groupby#
Lesson Content
groupby
groupby bins
Context#
Yesterday we began to explore working with data in xarray. Today we are going to dig into that even deeper with a concept called groupby. Groupby is going to allow us to split our data up into different categories and analyze them based on those categories. It sounds a bit abstract right now, but just wait - it’s powerful!
import xarray as xr
# Open one day (1982-10-07) of NOAA OISST v2.1 AVHRR sea-surface-temperature
# data directly from the NCEI THREDDS server via OPeNDAP (no local download).
# NOTE(review): the captured output below shows this URL failing with
# "NetCDF: file not found" — confirm the THREDDS endpoint is still live.
sst = xr.open_dataset("https://www.ncei.noaa.gov/thredds/dodsC/OisstBase/NetCDF/V2.1/AVHRR/198210/oisst-avhrr-v02r01.19821007.nc")
# Keep only the 'sst' variable and drop the singleton depth ('zlev') dimension,
# leaving a DataArray with (time, lat, lon) dimensions.
sst = sst['sst'].squeeze(dim='zlev', drop=True)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
File ~/miniconda3/envs/sarp/lib/python3.13/site-packages/xarray/backends/file_manager.py:211, in CachingFileManager._acquire_with_cache_info(self, needs_lock)
210 try:
--> 211 file = self._cache[self._key]
212 except KeyError:
File ~/miniconda3/envs/sarp/lib/python3.13/site-packages/xarray/backends/lru_cache.py:56, in LRUCache.__getitem__(self, key)
55 with self._lock:
---> 56 value = self._cache[key]
57 self._cache.move_to_end(key)
KeyError: [<class 'netCDF4._netCDF4.Dataset'>, ('https://www.ncei.noaa.gov/thredds/dodsC/OisstBase/NetCDF/V2.1/AVHRR/198210/oisst-avhrr-v02r01.19821007.nc',), 'r', (('clobber', True), ('diskless', False), ('format', 'NETCDF4'), ('persist', False)), '8b44a4ac-5d9b-4b62-907f-4b3dc2ea51a6']
During handling of the above exception, another exception occurred:
OSError Traceback (most recent call last)
Cell In[2], line 1
----> 1 sst = xr.open_dataset("https://www.ncei.noaa.gov/thredds/dodsC/OisstBase/NetCDF/V2.1/AVHRR/198210/oisst-avhrr-v02r01.19821007.nc")
3 sst = sst['sst'].squeeze(dim='zlev', drop=True)
File ~/miniconda3/envs/sarp/lib/python3.13/site-packages/xarray/backends/api.py:679, in open_dataset(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, inline_array, chunked_array_type, from_array_kwargs, backend_kwargs, **kwargs)
667 decoders = _resolve_decoders_kwargs(
668 decode_cf,
669 open_backend_dataset_parameters=backend.open_dataset_parameters,
(...)
675 decode_coords=decode_coords,
676 )
678 overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None)
--> 679 backend_ds = backend.open_dataset(
680 filename_or_obj,
681 drop_variables=drop_variables,
682 **decoders,
683 **kwargs,
684 )
685 ds = _dataset_from_backend_dataset(
686 backend_ds,
687 filename_or_obj,
(...)
697 **kwargs,
698 )
699 return ds
File ~/miniconda3/envs/sarp/lib/python3.13/site-packages/xarray/backends/netCDF4_.py:666, in NetCDF4BackendEntrypoint.open_dataset(self, filename_or_obj, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, group, mode, format, clobber, diskless, persist, auto_complex, lock, autoclose)
644 def open_dataset(
645 self,
646 filename_or_obj: str | os.PathLike[Any] | ReadBuffer | AbstractDataStore,
(...)
663 autoclose=False,
664 ) -> Dataset:
665 filename_or_obj = _normalize_path(filename_or_obj)
--> 666 store = NetCDF4DataStore.open(
667 filename_or_obj,
668 mode=mode,
669 format=format,
670 group=group,
671 clobber=clobber,
672 diskless=diskless,
673 persist=persist,
674 auto_complex=auto_complex,
675 lock=lock,
676 autoclose=autoclose,
677 )
679 store_entrypoint = StoreBackendEntrypoint()
680 with close_on_error(store):
File ~/miniconda3/envs/sarp/lib/python3.13/site-packages/xarray/backends/netCDF4_.py:452, in NetCDF4DataStore.open(cls, filename, mode, format, group, clobber, diskless, persist, auto_complex, lock, lock_maker, autoclose)
448 kwargs["auto_complex"] = auto_complex
449 manager = CachingFileManager(
450 netCDF4.Dataset, filename, mode=mode, kwargs=kwargs
451 )
--> 452 return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose)
File ~/miniconda3/envs/sarp/lib/python3.13/site-packages/xarray/backends/netCDF4_.py:393, in NetCDF4DataStore.__init__(self, manager, group, mode, lock, autoclose)
391 self._group = group
392 self._mode = mode
--> 393 self.format = self.ds.data_model
394 self._filename = self.ds.filepath()
395 self.is_remote = is_remote_uri(self._filename)
File ~/miniconda3/envs/sarp/lib/python3.13/site-packages/xarray/backends/netCDF4_.py:461, in NetCDF4DataStore.ds(self)
459 @property
460 def ds(self):
--> 461 return self._acquire()
File ~/miniconda3/envs/sarp/lib/python3.13/site-packages/xarray/backends/netCDF4_.py:455, in NetCDF4DataStore._acquire(self, needs_lock)
454 def _acquire(self, needs_lock=True):
--> 455 with self._manager.acquire_context(needs_lock) as root:
456 ds = _nc4_require_group(root, self._group, self._mode)
457 return ds
File ~/miniconda3/envs/sarp/lib/python3.13/contextlib.py:141, in _GeneratorContextManager.__enter__(self)
139 del self.args, self.kwds, self.func
140 try:
--> 141 return next(self.gen)
142 except StopIteration:
143 raise RuntimeError("generator didn't yield") from None
File ~/miniconda3/envs/sarp/lib/python3.13/site-packages/xarray/backends/file_manager.py:199, in CachingFileManager.acquire_context(self, needs_lock)
196 @contextlib.contextmanager
197 def acquire_context(self, needs_lock=True):
198 """Context manager for acquiring a file."""
--> 199 file, cached = self._acquire_with_cache_info(needs_lock)
200 try:
201 yield file
File ~/miniconda3/envs/sarp/lib/python3.13/site-packages/xarray/backends/file_manager.py:217, in CachingFileManager._acquire_with_cache_info(self, needs_lock)
215 kwargs = kwargs.copy()
216 kwargs["mode"] = self._mode
--> 217 file = self._opener(*self._args, **kwargs)
218 if self._mode == "w":
219 # ensure file doesn't get overridden when opened again
220 self._mode = "a"
File src/netCDF4/_netCDF4.pyx:2521, in netCDF4._netCDF4.Dataset.__init__()
File src/netCDF4/_netCDF4.pyx:2158, in netCDF4._netCDF4._ensure_nc_success()
OSError: [Errno -90] NetCDF: file not found: 'https://www.ncei.noaa.gov/thredds/dodsC/OisstBase/NetCDF/V2.1/AVHRR/198210/oisst-avhrr-v02r01.19821007.nc'
Groupby#
While we have lots of individual gridpoints in our dataset, sometimes we don’t care about each individual reading. Instead we probably care about the aggregate of a specific group of readings.
For example:
Given the average temperature of every county in the US, what is the average temperature in each state?
Given a list of the opening dates of every Chuck E Cheese store, how many Chuck E Cheeses were opened each year? 🧀
In xarray we answer questions like these with groupby
.
Breaking groupby
into conceptual parts#
In addition to the dataframe, there are three main parts to a groupby:
Which variable we want to group together
How we want to group
The variable we want to see in the end
Without getting into syntax yet we can start by identifying these in our two example questions.
Given the average temperature of every county in the US, what is the average temperature in each state?
Which variable to group together? -> We want to group counties into states
How do we want to group? -> Take the average
What variable do we want to look at? Temperature
Given a list of the opening dates of every Chuck E Cheese store, how many Chuck E Cheeses were opened each year?
Which variable to group together? -> We want to group individual days into years
How do we want to group? -> Count them
What variable do we want to look at? Number of stores
📝 Check your understanding
Identify each of three main groupby parts in the following scenario:
Given the hourly temperatures for a location over the course of a month, what were the daily highs?
Which variable to group together?
How do we want to group?
What variable do we want to look at?
groupby
syntax#
We can take these groupby
concepts and translate them into syntax. The first two parts (which variable to group & how do we want to group) are required for xarray. The third one is optional.
Starting with just the two required variables, the general syntax is:
DATAFRAME.groupby(WHICH_GROUP).AGGREGATION()
Words in all capitals are variables. We’ll go into each part a little more below.
# Group each value by the month of its timestamp, then average within each
# group. This dataset holds a single day, so only one month group (October)
# comes back — the pattern shines on multi-month data (e.g. on the homework).
sst.groupby('time.month').mean()
<xarray.DataArray 'sst' (month: 1, lat: 720, lon: 1440)> array([[[ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [-1.1899999, -1.1899999, -1.1899999, ..., -1.1899999, -1.1899999, -1.1899999], [-1.1899999, -1.1899999, -1.1899999, ..., -1.1899999, -1.1899999, -1.1899999], [-1.1899999, -1.1899999, -1.1899999, ..., -1.1899999, -1.1899999, -1.1899999]]], dtype=float32) Coordinates: * lat (lat) float32 -89.88 -89.62 -89.38 -89.12 ... 89.38 89.62 89.88 * lon (lon) float32 0.125 0.375 0.625 0.875 ... 359.1 359.4 359.6 359.9 * month (month) int64 10 Attributes: long_name: Daily sea surface temperature units: Celsius valid_min: -300 valid_max: 4500 _ChunkSizes: [ 1 1 720 1440]
'WHICH_GROUP'
#
This can be any of the dimensions of your dataset. In physical oceanography, for example, it is common to group by latitude, so that you can see how a variable changes as you move closer to or further away from the equator.
sst.groupby('lat').mean(...)
<xarray.DataArray 'sst' (lat: 720)> array([ nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, -1.51692557e+00, -1.53166163e+00, -1.55314803e+00, -1.57018566e+00, -1.58648872e+00, -1.59056687e+00, -1.59533656e+00, -1.61027515e+00, -1.63276756e+00, -1.64728677e+00, -1.65520298e+00, -1.65427852e+00, -1.65435839e+00, -1.63652635e+00, -1.64526653e+00, -1.62598121e+00, -1.63246810e+00, -1.65207720e+00, -1.67231214e+00, -1.67294443e+00, -1.67326570e+00, -1.65209961e+00, -1.64746606e+00, -1.65392542e+00, -1.66650736e+00, -1.67372024e+00, -1.67431283e+00, -1.67534673e+00, -1.67392111e+00, -1.66620767e+00, -1.66788197e+00, -1.66215181e+00, -1.65882778e+00, -1.65802693e+00, ... 1.60739255e+00, 1.40621650e+00, 1.21297944e+00, 1.08243787e+00, 9.51626062e-01, 8.30649316e-01, 7.59354472e-01, 6.84241772e-01, 5.59581220e-01, 4.71847802e-01, 3.53565246e-01, 1.82334244e-01, 4.14617397e-02, -5.61492033e-02, -1.36128694e-01, -1.94080517e-01, -1.99803740e-01, -2.02549189e-01, -2.60961711e-01, -3.16458732e-01, -4.41025198e-01, -5.69864392e-01, -6.84798539e-01, -7.52268314e-01, -8.07088673e-01, -8.45351577e-01, -8.26967835e-01, -8.53520274e-01, -8.92809749e-01, -9.03403878e-01, -9.43648040e-01, -1.00938058e+00, -1.06675804e+00, -1.13523424e+00, -1.16256785e+00, -1.09204960e+00, -1.07783961e+00, -1.06503558e+00, -1.09915543e+00, -1.12950087e+00, -1.14294791e+00, -1.16515648e+00, -1.17709875e+00, -1.18073046e+00, -1.18820989e+00, -1.18804479e+00, -1.18709552e+00, -1.19452322e+00, -1.20153499e+00, -1.20323992e+00, -1.20159018e+00, -1.19786108e+00, -1.19466662e+00, -1.19252777e+00, -1.19127083e+00, -1.19085419e+00, -1.19068742e+00, -1.19081247e+00, -1.19086802e+00, -1.19065273e+00, -1.19079864e+00, -1.19093752e+00, -1.19086802e+00, -1.19090271e+00, -1.19074297e+00, -1.19078457e+00, -1.19062495e+00, -1.19081247e+00, 
-1.19086790e+00, -1.19059026e+00, -1.19079864e+00, -1.19078457e+00, -1.19057631e+00, -1.19061089e+00, -1.18999982e+00, -1.18999982e+00], dtype=float32) Coordinates: * lat (lat) float32 -89.88 -89.62 -89.38 -89.12 ... 89.38 89.62 89.88 Attributes: long_name: Daily sea surface temperature units: Celsius valid_min: -300 valid_max: 4500 _ChunkSizes: [ 1 1 720 1440]
AGGREGATION
#
The goal with each of the groups of data is to end up with a single value for the things in that group. To tell xarray how to gather the datapoints together we specify which function we would like it to use. Any of the aggregation functions we talked about at the beginning of the lesson work for this!
sst.groupby('lat').mean(...).plot()
[<matplotlib.lines.Line2D at 0x163c46290>]

What do we see? Hot water near the equator and chilly water near the poles.
Note
The ellipses ...
inside the .mean()
tell xarray to take the mean over all of the remaining axes. You wouldn’t have to do that - you may instead want to take the mean over just the latitude and keep the time resolution. It’s quite common, though, to want to aggregate over all remaining axes.
time dimension#
If your data has a time dimension and it is formatted as a datetime object you can take advantage of some slick grouping capabilities. For example, you can group by a time group like 'time.month'
, which will make 12 groups for you, putting all the data from each month into its own group.
# 'time.month' groups every timestamp by its calendar month. With only one
# day of data there is a single group here — on a longer record this would
# produce up to 12 monthly groups automatically.
sst.groupby('time.month').mean()
<xarray.DataArray 'sst' (month: 1, lat: 720, lon: 1440)> array([[[ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], [ nan, nan, nan, ..., nan, nan, nan], ..., [-1.1899999, -1.1899999, -1.1899999, ..., -1.1899999, -1.1899999, -1.1899999], [-1.1899999, -1.1899999, -1.1899999, ..., -1.1899999, -1.1899999, -1.1899999], [-1.1899999, -1.1899999, -1.1899999, ..., -1.1899999, -1.1899999, -1.1899999]]], dtype=float32) Coordinates: * lat (lat) float32 -89.88 -89.62 -89.38 -89.12 ... 89.38 89.62 89.88 * lon (lon) float32 0.125 0.375 0.625 0.875 ... 359.1 359.4 359.6 359.9 * month (month) int64 10 Attributes: long_name: Daily sea surface temperature units: Celsius valid_min: -300 valid_max: 4500 _ChunkSizes: [ 1 1 720 1440]
groupby bins#
Breaking down the process#
There is a lot that happens in a single step with groupby
and it can be a lot to take in. One way to mentally situate this process is to think about split-apply-combine.
split-apply-combine breaks down the groupby
process into those three steps:
SPLIT the full data set into groups. Split is related to the question Which variable to group together?
APPLY the aggregation function to the individual groups. Apply is related to the question How do we want to group?
COMBINE the aggregated data into a new dataframe
