You can run this notebook in a live session or view it on Github.
Calculating Seasonal Averages from Time Series of Monthly Means¶
Author: Joe Hamman
The data used for this example can be found in the xarray-data repository. You may need to change the path to rasm.nc
below.
Suppose we have a netCDF or xarray.Dataset
of monthly mean data and we want to calculate the seasonal average. To do this properly, we need to calculate the weighted average considering that each month has a different number of days.
[1]:
%matplotlib inline
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
Open the Dataset
¶
[2]:
ds = xr.tutorial.open_dataset("rasm").load()
ds
---------------------------------------------------------------------------
gaierror Traceback (most recent call last)
File /usr/lib/python3/dist-packages/urllib3/connection.py:198, in HTTPConnection._new_conn(self)
197 try:
--> 198 sock = connection.create_connection(
199 (self._dns_host, self.port),
200 self.timeout,
201 source_address=self.source_address,
202 socket_options=self.socket_options,
203 )
204 except socket.gaierror as e:
File /usr/lib/python3/dist-packages/urllib3/util/connection.py:60, in create_connection(address, timeout, source_address, socket_options)
58 raise LocationParseError(f"'{host}', label empty or too long") from None
---> 60 for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
61 af, socktype, proto, canonname, sa = res
File /usr/lib/python3.13/socket.py:977, in getaddrinfo(host, port, family, type, proto, flags)
976 addrlist = []
--> 977 for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
978 af, socktype, proto, canonname, sa = res
gaierror: [Errno -3] Temporary failure in name resolution
The above exception was the direct cause of the following exception:
NameResolutionError Traceback (most recent call last)
File /usr/lib/python3/dist-packages/urllib3/connectionpool.py:787, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, preload_content, decode_content, **response_kw)
786 # Make the request on the HTTPConnection object
--> 787 response = self._make_request(
788 conn,
789 method,
790 url,
791 timeout=timeout_obj,
792 body=body,
793 headers=headers,
794 chunked=chunked,
795 retries=retries,
796 response_conn=response_conn,
797 preload_content=preload_content,
798 decode_content=decode_content,
799 **response_kw,
800 )
802 # Everything went great!
File /usr/lib/python3/dist-packages/urllib3/connectionpool.py:488, in HTTPConnectionPool._make_request(self, conn, method, url, body, headers, retries, timeout, chunked, response_conn, preload_content, decode_content, enforce_content_length)
487 new_e = _wrap_proxy_error(new_e, conn.proxy.scheme)
--> 488 raise new_e
490 # conn.request() calls http.client.*.request, not the method in
491 # urllib3.request. It also calls makefile (recv) on the socket.
File /usr/lib/python3/dist-packages/urllib3/connectionpool.py:464, in HTTPConnectionPool._make_request(self, conn, method, url, body, headers, retries, timeout, chunked, response_conn, preload_content, decode_content, enforce_content_length)
463 try:
--> 464 self._validate_conn(conn)
465 except (SocketTimeout, BaseSSLError) as e:
File /usr/lib/python3/dist-packages/urllib3/connectionpool.py:1093, in HTTPSConnectionPool._validate_conn(self, conn)
1092 if conn.is_closed:
-> 1093 conn.connect()
1095 # TODO revise this, see https://github.com/urllib3/urllib3/issues/2791
File /usr/lib/python3/dist-packages/urllib3/connection.py:704, in HTTPSConnection.connect(self)
703 sock: socket.socket | ssl.SSLSocket
--> 704 self.sock = sock = self._new_conn()
705 server_hostname: str = self.host
File /usr/lib/python3/dist-packages/urllib3/connection.py:205, in HTTPConnection._new_conn(self)
204 except socket.gaierror as e:
--> 205 raise NameResolutionError(self.host, self, e) from e
206 except SocketTimeout as e:
NameResolutionError: <urllib3.connection.HTTPSConnection object at 0x7f2296fcdd30>: Failed to resolve 'github.com' ([Errno -3] Temporary failure in name resolution)
The above exception was the direct cause of the following exception:
MaxRetryError Traceback (most recent call last)
File /usr/lib/python3/dist-packages/requests/adapters.py:667, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
666 try:
--> 667 resp = conn.urlopen(
668 method=request.method,
669 url=url,
670 body=request.body,
671 headers=request.headers,
672 redirect=False,
673 assert_same_host=False,
674 preload_content=False,
675 decode_content=False,
676 retries=self.max_retries,
677 timeout=timeout,
678 chunked=chunked,
679 )
681 except (ProtocolError, OSError) as err:
File /usr/lib/python3/dist-packages/urllib3/connectionpool.py:841, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, preload_content, decode_content, **response_kw)
839 new_e = ProtocolError("Connection aborted.", new_e)
--> 841 retries = retries.increment(
842 method, url, error=new_e, _pool=self, _stacktrace=sys.exc_info()[2]
843 )
844 retries.sleep()
File /usr/lib/python3/dist-packages/urllib3/util/retry.py:519, in Retry.increment(self, method, url, response, error, _pool, _stacktrace)
518 reason = error or ResponseError(cause)
--> 519 raise MaxRetryError(_pool, url, reason) from reason # type: ignore[arg-type]
521 log.debug("Incremented Retry for (url='%s'): %r", url, new_retry)
MaxRetryError: HTTPSConnectionPool(host='github.com', port=443): Max retries exceeded with url: /pydata/xarray-data/raw/master/rasm.nc (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7f2296fcdd30>: Failed to resolve 'github.com' ([Errno -3] Temporary failure in name resolution)"))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
Cell In[2], line 1
----> 1 ds = xr.tutorial.open_dataset("rasm").load()
2 ds
File /usr/lib/python3/dist-packages/xarray/tutorial.py:167, in open_dataset(name, cache, cache_dir, engine, **kws)
164 downloader = pooch.HTTPDownloader(headers=headers)
166 # retrieve the file
--> 167 filepath = pooch.retrieve(
168 url=url, known_hash=None, path=cache_dir, downloader=downloader
169 )
170 ds = _open_dataset(filepath, engine=engine, **kws)
171 if not cache:
File /usr/lib/python3/dist-packages/pooch/core.py:239, in retrieve(url, known_hash, fname, path, processor, downloader, progressbar)
236 if downloader is None:
237 downloader = choose_downloader(url, progressbar=progressbar)
--> 239 stream_download(url, full_path, known_hash, downloader, pooch=None)
241 if known_hash is None:
242 get_logger().info(
243 "SHA256 hash of downloaded file: %s\n"
244 "Use this value as the 'known_hash' argument of 'pooch.retrieve'"
(...)
247 file_hash(str(full_path)),
248 )
File /usr/lib/python3/dist-packages/pooch/core.py:807, in stream_download(url, fname, known_hash, downloader, pooch, retry_if_failed)
803 try:
804 # Stream the file to a temporary so that we can safely check its
805 # hash before overwriting the original.
806 with temporary_file(path=str(fname.parent)) as tmp:
--> 807 downloader(url, tmp, pooch)
808 hash_matches(tmp, known_hash, strict=True, source=str(fname.name))
809 shutil.move(tmp, str(fname))
File /usr/lib/python3/dist-packages/pooch/downloaders.py:220, in HTTPDownloader.__call__(self, url, output_file, pooch, check_only)
218 # pylint: enable=consider-using-with
219 try:
--> 220 response = requests.get(url, timeout=timeout, **kwargs)
221 response.raise_for_status()
222 content = response.iter_content(chunk_size=self.chunk_size)
File /usr/lib/python3/dist-packages/requests/api.py:73, in get(url, params, **kwargs)
62 def get(url, params=None, **kwargs):
63 r"""Sends a GET request.
64
65 :param url: URL for the new :class:`Request` object.
(...)
70 :rtype: requests.Response
71 """
---> 73 return request("get", url, params=params, **kwargs)
File /usr/lib/python3/dist-packages/requests/api.py:59, in request(method, url, **kwargs)
55 # By using the 'with' statement we are sure the session is closed, thus we
56 # avoid leaving sockets open which can trigger a ResourceWarning in some
57 # cases, and look like a memory leak in others.
58 with sessions.Session() as session:
---> 59 return session.request(method=method, url=url, **kwargs)
File /usr/lib/python3/dist-packages/requests/sessions.py:589, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
584 send_kwargs = {
585 "timeout": timeout,
586 "allow_redirects": allow_redirects,
587 }
588 send_kwargs.update(settings)
--> 589 resp = self.send(prep, **send_kwargs)
591 return resp
File /usr/lib/python3/dist-packages/requests/sessions.py:703, in Session.send(self, request, **kwargs)
700 start = preferred_clock()
702 # Send the request
--> 703 r = adapter.send(request, **kwargs)
705 # Total elapsed time of the request (approximately)
706 elapsed = preferred_clock() - start
File /usr/lib/python3/dist-packages/requests/adapters.py:700, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
696 if isinstance(e.reason, _SSLError):
697 # This branch is for urllib3 v1.22 and later.
698 raise SSLError(e, request=request)
--> 700 raise ConnectionError(e, request=request)
702 except ClosedPoolError as e:
703 raise ConnectionError(e, request=request)
ConnectionError: HTTPSConnectionPool(host='github.com', port=443): Max retries exceeded with url: /pydata/xarray-data/raw/master/rasm.nc (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7f2296fcdd30>: Failed to resolve 'github.com' ([Errno -3] Temporary failure in name resolution)"))
Now for the heavy lifting:¶
We first have to come up with the weights, - calculate the month length for each monthly data record - calculate weights using groupby('time.season')
Finally, we just need to multiply our weights by the Dataset
and sum along the time dimension. Creating a DataArray
for the month length is as easy as using the days_in_month
accessor on the time coordinate. The calendar type, in this case 'noleap'
, is automatically considered in this operation.
[3]:
month_length = ds.time.dt.days_in_month
month_length
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[3], line 1
----> 1 month_length = ds.time.dt.days_in_month
2 month_length
NameError: name 'ds' is not defined
[4]:
# Calculate the weights by grouping by 'time.season'.
weights = (
month_length.groupby("time.season") / month_length.groupby("time.season").sum()
)
# Test that the sum of the weights for each season is 1.0
np.testing.assert_allclose(weights.groupby("time.season").sum().values, np.ones(4))
# Calculate the weighted average
ds_weighted = (ds * weights).groupby("time.season").sum(dim="time")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[4], line 3
1 # Calculate the weights by grouping by 'time.season'.
2 weights = (
----> 3 month_length.groupby("time.season") / month_length.groupby("time.season").sum()
4 )
6 # Test that the sum of the weights for each season is 1.0
7 np.testing.assert_allclose(weights.groupby("time.season").sum().values, np.ones(4))
NameError: name 'month_length' is not defined
[5]:
ds_weighted
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[5], line 1
----> 1 ds_weighted
NameError: name 'ds_weighted' is not defined
[6]:
# only used for comparisons
ds_unweighted = ds.groupby("time.season").mean("time")
ds_diff = ds_weighted - ds_unweighted
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[6], line 2
1 # only used for comparisons
----> 2 ds_unweighted = ds.groupby("time.season").mean("time")
3 ds_diff = ds_weighted - ds_unweighted
NameError: name 'ds' is not defined
[7]:
# Quick plot to show the results
notnull = pd.notnull(ds_unweighted["Tair"][0])
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(14, 12))
for i, season in enumerate(("DJF", "MAM", "JJA", "SON")):
ds_weighted["Tair"].sel(season=season).where(notnull).plot.pcolormesh(
ax=axes[i, 0],
vmin=-30,
vmax=30,
cmap="Spectral_r",
add_colorbar=True,
extend="both",
)
ds_unweighted["Tair"].sel(season=season).where(notnull).plot.pcolormesh(
ax=axes[i, 1],
vmin=-30,
vmax=30,
cmap="Spectral_r",
add_colorbar=True,
extend="both",
)
ds_diff["Tair"].sel(season=season).where(notnull).plot.pcolormesh(
ax=axes[i, 2],
vmin=-0.1,
vmax=0.1,
cmap="RdBu_r",
add_colorbar=True,
extend="both",
)
axes[i, 0].set_ylabel(season)
axes[i, 1].set_ylabel("")
axes[i, 2].set_ylabel("")
for ax in axes.flat:
ax.axes.get_xaxis().set_ticklabels([])
ax.axes.get_yaxis().set_ticklabels([])
ax.axes.axis("tight")
ax.set_xlabel("")
axes[0, 0].set_title("Weighted by DPM")
axes[0, 1].set_title("Equal Weighting")
axes[0, 2].set_title("Difference")
plt.tight_layout()
fig.suptitle("Seasonal Surface Air Temperature", fontsize=16, y=1.02)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[7], line 2
1 # Quick plot to show the results
----> 2 notnull = pd.notnull(ds_unweighted["Tair"][0])
4 fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(14, 12))
5 for i, season in enumerate(("DJF", "MAM", "JJA", "SON")):
NameError: name 'ds_unweighted' is not defined
[8]:
# Wrap it into a simple function
def season_mean(ds, calendar="standard"):
# Make a DataArray with the number of days in each month, size = len(time)
month_length = ds.time.dt.days_in_month
# Calculate the weights by grouping by 'time.season'
weights = (
month_length.groupby("time.season") / month_length.groupby("time.season").sum()
)
# Test that the sum of the weights for each season is 1.0
np.testing.assert_allclose(weights.groupby("time.season").sum().values, np.ones(4))
# Calculate the weighted average
return (ds * weights).groupby("time.season").sum(dim="time")