ERA5 T2m - get the training data
Ideally I’d put in here a script to download the training data to use. This time, however, I cheated and used pre-prepared data provided by a colleague at the Met Office (thanks Robin). So you are on your own for this one.
You can get the data I used from the Copernicus Climate Data Store, but I don’t have a pre-prepared script for it. (Download the hourly data at 0.25 degree resolution, and average it into daily means.)
Once you’ve got the data as a local netCDF file you are going to need to make a climatology (our VAE works on anomalies) and a variability climatology (comparing model uncertainty to expected climatological variability tells us where we have skill).
Utility script for data access (imported in most other scripts using this data):
# Load daily T2m data from ERA5
import os
import iris
import pickle
# Convert anomalies to the range 0-1 (ish) by replacing each value
# with its corresponding CDF value
def ERA5_quantile_normalise(f, quantiles=None):
    """Quantile-normalise a cube against a set of climatological percentiles.

    Each data value is replaced by its (piecewise-linear) position in the
    percentile table, divided by 100, so typical values land in [0, 1]
    (values outside the 1st-99th percentile range extrapolate slightly
    beyond it).

    f         -- iris cube (anything with a .data ndarray and .copy())
    quantiles -- sequence of 99 ascending percentile values (1st..99th);
                 when None, loaded from the pre-computed pickle under
                 $SCRATCH.

    Returns a copy of f with .data replaced by the normalised values.
    """
    if quantiles is None:
        # Close the pickle file deterministically (the original leaked it)
        with open(
            "%s/Proxy_20CR/datasets/ERA5/daily_T2m/Quantiles/total_pctl.pkl"
            % os.getenv("SCRATCH"),
            "rb",
        ) as pfile:
            quantiles = pickle.load(pfile)
    res = f.data.copy()
    # Below the 1st percentile: extrapolate down using the first bin's width
    below = f.data < quantiles[0]
    if below.any():
        res[below] = 1 - (quantiles[0] - f.data[below]) / (
            quantiles[1] - quantiles[0]
        )
    # Interior bins: linear interpolation between adjacent percentiles.
    for pct in range(0, 98):
        inbin = (f.data >= quantiles[pct]) & (f.data < quantiles[pct + 1])
        if inbin.any():
            # BUGFIX: interpolate upward from the lower quantile so the
            # mapping increases within each bin and is continuous with the
            # tails (the original used quantiles[pct+1]-value, which
            # inverted every bin).
            res[inbin] = pct + 1 + (f.data[inbin] - quantiles[pct]) / (
                quantiles[pct + 1] - quantiles[pct]
            )
    # At/above the 99th percentile: extrapolate up using the last bin's width.
    # BUGFIX: '>=' rather than '>' - a value exactly equal to quantiles[98]
    # previously fell through every branch and kept its raw (un-normalised)
    # value.
    above = f.data >= quantiles[98]
    if above.any():
        res[above] = 99 + (f.data[above] - quantiles[98]) / (
            quantiles[98] - quantiles[97]
        )
    r = f.copy()
    r.data = res / 100
    return r
def ERA5_roll_longitude(f):
    """Convert a cube from 0..360 longitudes to -180..180.

    Splits the cube at 180 degrees, shifts the eastern half down by 360,
    and concatenates the two halves back into a single cube.
    """
    f1 = f.extract(iris.Constraint(longitude=lambda cell: cell > 180))
    # Use the public coord API rather than poking at the private
    # cube._dim_coords_and_dims (which also silently assumed longitude
    # was dimension 1).
    f1.coord("longitude").points = f1.coord("longitude").points - 360
    f2 = f.extract(iris.Constraint(longitude=lambda cell: cell <= 180))
    f = iris.cube.CubeList([f1, f2]).concatenate_cube()
    return f
def ERA5_trim(f, x=1440, y=720):
    """Trim a cube to x columns of longitude and y rows of latitude.

    Keeps the last x longitude points and the first y latitude points
    (assumes ERA5's usual north-to-south latitude ordering, so this keeps
    the northernmost rows - TODO confirm against the input files).
    """
    lon_floor = f.coord("longitude").points[-x]
    lat_floor = f.coord("latitude").points[y - 1]
    keep = iris.Constraint(
        longitude=lambda cell: cell >= lon_floor
    ) & iris.Constraint(latitude=lambda cell: cell >= lat_floor)
    return f.extract(keep)
def ERA5_load_LS_mask():
    """Build a land/sea mask cube from one day of ERA5 SST.

    Points with a valid SST become 0 (sea); masked points - which hold a
    large fill value once the mask is stripped - become 1 (land).
    """
    fname = (
        "%s/Proxy_20CR/datasets/ERA5/daily_SST/"
        + "era5_daily_0m_sea_surface_temperature_19790101to20210831.nc"
    ) % os.getenv("SCRATCH")
    first_day = iris.Constraint(
        time=lambda cell: cell.point.year == 1979
        and cell.point.month == 1
        and cell.point.day == 1
    )
    mask = iris.load_cube(fname, first_day)
    # Strip the mask, exposing the fill value over land
    mask.data = mask.data.data
    # Real SSTs (Kelvin) are far below 10000; only fill values exceed it
    mask.data[mask.data < 10000] = 0
    mask.data[mask.data > 0] = 1
    return mask
def ERA5_load_T2m(year, month, day):
    """Load one day of ERA5 2m temperature as an iris cube."""
    fname = (
        "%s/Proxy_20CR/datasets/ERA5/daily_T2m/"
        + "era5_daily_2m_temperature_19790101to20210831.nc"
    ) % os.getenv("SCRATCH")
    date_match = iris.Constraint(
        time=lambda cell: cell.point.year == year
        and cell.point.month == month
        and cell.point.day == day
    )
    return iris.load_cube(fname, date_match)
def ERA5_load_T2m_climatology(year, month, day):
    """Load the pre-computed daily T2m climatology for (month, day).

    The year argument is accepted for interface symmetry with
    ERA5_load_T2m but is not used - the climatology has no year
    dependence. Feb 29 falls back to the Feb 28 file.
    """
    if (month, day) == (2, 29):
        day = 28
    fname = ("%s/Proxy_20CR/datasets/ERA5/daily_T2m/" + "climatology/%02d/%02d.nc") % (
        os.getenv("SCRATCH"),
        month,
        day,
    )
    return iris.load_cube(fname)
def ERA5_load_T2m_variability_climatology(year, month, day):
    """Load the pre-computed daily T2m variability climatology for (month, day).

    The year argument is accepted for interface symmetry with
    ERA5_load_T2m but is not used. Feb 29 falls back to the Feb 28 file.
    """
    if (month, day) == (2, 29):
        day = 28
    fname = (
        "%s/Proxy_20CR/datasets/ERA5/daily_T2m/"
        + "variability_climatology/%02d/%02d.nc"
    ) % (
        os.getenv("SCRATCH"),
        month,
        day,
    )
    return iris.load_cube(fname)
Scripts to make the climatology:
#!/usr/bin/env python

# Make the ERA5 T2m climatology for one calendar day:
# the 1981-2010 mean of that (month, day).

import os
import sys
import iris

sys.path.append("%s/.." % os.path.dirname(__file__))
from ERA5_load import ERA5_load_T2m

# Going to do external parallelism - run this on one core
import dask

dask.config.set(scheduler="single-threaded")

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--month", help="Integer month", type=int, required=True)
parser.add_argument("--day", help="Day of month", type=int, required=True)
args = parser.parse_args()

opfile = "%s/Proxy_20CR/datasets/ERA5/daily_T2m/climatology/%02d/%02d.nc" % (
    os.getenv("SCRATCH"),
    args.month,
    args.day,
)
# exist_ok avoids a race when many of these jobs run in parallel
# (the original isdir-then-makedirs check could crash if another job
# created the directory in between).
os.makedirs(os.path.dirname(opfile), exist_ok=True)

# Accumulate the 30 years 1981-2010, then divide for the mean
c = None
for year in range(1981, 2011):
    yd = ERA5_load_T2m(year, args.month, args.day)
    if c is None:
        c = yd.copy()
    else:
        c += yd
c /= 30
iris.save(c, opfile)
#!/usr/bin/env python

# Make an ERA5 T2m climatology file for each day of the year.
# This script does not run the commands - it makes a list of commands
# (in the file 'run.txt') which can be run in parallel.

import os
import datetime


# Function to check if the job is already done for this timepoint
def is_done(month, day):
    """Return True when the climatology file for (month, day) already exists."""
    op_file_name = "%s/Proxy_20CR/datasets/ERA5/daily_T2m/climatology/%02d/%02d.nc" % (
        os.getenv("SCRATCH"),
        month,
        day,
    )
    return os.path.isfile(op_file_name)


# Iterate over one (non-leap) year to cover every (month, day);
# Feb 29 is handled by the loaders falling back to Feb 28.
start_day = datetime.date(1981, 1, 1)
end_day = datetime.date(1981, 12, 31)

# Context manager ensures run.txt is closed even on error
with open("run.txt", "w") as cmd_file:
    current_day = start_day
    while current_day <= end_day:
        if not is_done(current_day.month, current_day.day):
            cmd_file.write(
                "./make_climatology_day.py --month=%d --day=%d \n"
                % (current_day.month, current_day.day)
            )
        current_day += datetime.timedelta(days=1)
Scripts to make the variability climatology:
#!/usr/bin/env python

# Make the ERA5 T2m variability climatology for one calendar day:
# the root-mean-square deviation of 1981-2010 from the daily climatology.

import os
import sys
import numpy as np
import iris
import iris.analysis.maths

sys.path.append("%s/.." % os.path.dirname(__file__))
from ERA5_load import ERA5_load_T2m
from ERA5_load import ERA5_load_T2m_climatology

# Going to do external parallelism - run this on one core
import dask

dask.config.set(scheduler="single-threaded")

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--month", help="Integer month", type=int, required=True)
parser.add_argument("--day", help="Day of month", type=int, required=True)
args = parser.parse_args()

opfile = (
    "%s/Proxy_20CR/datasets/ERA5/daily_T2m/variability_climatology/%02d/%02d.nc"
    % (
        os.getenv("SCRATCH"),
        args.month,
        args.day,
    )
)
# exist_ok avoids a race when many of these jobs run in parallel
# (the original isdir-then-makedirs check could crash if another job
# created the directory in between).
os.makedirs(os.path.dirname(opfile), exist_ok=True)

# Climatology is year-independent; the year argument is ignored
yc = ERA5_load_T2m_climatology(1981, args.month, args.day)

# Accumulate squared anomalies over 1981-2010, then take the RMS
c = None
for year in range(1981, 2011):
    yd = ERA5_load_T2m(year, args.month, args.day)
    yd = yd - yc
    yd = yd * yd
    if c is None:
        c = yd.copy()
    else:
        c += yd
c /= 30
c = iris.analysis.maths.apply_ufunc(np.sqrt, c)
iris.save(c, opfile)
#!/usr/bin/env python

# Make an ERA5 T2m variability climatology file for each day of the year.
# This script does not run the commands - it makes a list of commands
# (in the file 'run.txt') which can be run in parallel.

import os
import datetime


# Function to check if the job is already done for this timepoint
def is_done(month, day):
    """Return True when the variability-climatology file for (month, day) exists."""
    op_file_name = (
        "%s/Proxy_20CR/datasets/ERA5/daily_T2m/variability_climatology/%02d/%02d.nc"
    ) % (
        os.getenv("SCRATCH"),
        month,
        day,
    )
    return os.path.isfile(op_file_name)


# Iterate over one (non-leap) year to cover every (month, day);
# Feb 29 is handled by the loaders falling back to Feb 28.
start_day = datetime.date(1981, 1, 1)
end_day = datetime.date(1981, 12, 31)

# Context manager ensures run.txt is closed even on error
with open("run.txt", "w") as cmd_file:
    current_day = start_day
    while current_day <= end_day:
        if not is_done(current_day.month, current_day.day):
            # NOTE(review): is_done() checks the *variability* climatology
            # files, but this command invokes make_climatology_day.py (the
            # plain-climatology script) - presumably it should invoke the
            # variability-climatology day script instead. Confirm the
            # intended script name before running.
            cmd_file.write(
                "./make_climatology_day.py --month=%d --day=%d \n"
                % (current_day.month, current_day.day)
            )
        current_day += datetime.timedelta(days=1)