ERA5 T2m - get the training data

Ideally I’d include a script here to download the training data. This time, however, I cheated and used pre-prepared data provided by a colleague at the Met Office (thanks Robin), so you are on your own for this one.

You can get the data I used from the Copernicus Climate Data Store, but I don’t have a pre-prepared script for it. (Download the hourly data at 0.25 degree resolution, and average into daily means.)

Once you’ve got the data as a local netCDF file you are going to need to make a climatology (our VAE works on anomalies) and a variability climatology (comparing model uncertainty to expected climatological variability tells us where we have skill).

Utility script for data access (imported in most other scripts using this data):

# Load daily T2m data from ERA5

import os
import iris
import pickle


# Convert anomalies to the range 0-1 (ish) by replacing each value
#  with its corresponding CDF value
def ERA5_quantile_normalise(f, quantiles=None):
    """Quantile-normalise a cube of anomalies.

    Each data value is replaced by its (linearly interpolated) position
    in the climatological percentile distribution, scaled to roughly
    0-1. Values below the 1st or above the 99th percentile are linearly
    extrapolated using the nearest percentile spacing, so outputs can
    fall slightly outside [0, 1].

    Args:
        f: iris-like cube (only .data, a numpy array, and .copy() are used).
        quantiles: sequence of 99 percentile values (1st..99th). If None,
            loaded from the pre-computed pickle under $SCRATCH.

    Returns:
        A copy of f with .data replaced by the normalised values.
    """
    if quantiles is None:
        # Use a context manager so the pickle file handle is not leaked
        # (original left the file open).
        with open(
            "%s/Proxy_20CR/datasets/ERA5/daily_T2m/Quantiles/total_pctl.pkl"
            % os.getenv("SCRATCH"),
            "rb",
        ) as pfile:
            quantiles = pickle.load(pfile)
    res = f.data.copy()
    # Below the 1st percentile: extrapolate downward using the
    # 1st-2nd percentile spacing (maps to values < 1).
    below = f.data < quantiles[0]
    wkg = f.data[below]
    if len(wkg) > 0:
        res[below] = 1 - (quantiles[0] - wkg) / (quantiles[1] - quantiles[0])
    # Between successive percentiles: linear interpolation upward from
    # the lower percentile. BUGFIX: the original used
    # (quantiles[pct+1] - wkg), which made the mapping *decrease* within
    # each bin (non-monotone CDF); interpolate from the lower edge instead.
    for pct in range(0, 98):
        in_bin = (f.data >= quantiles[pct]) & (f.data < quantiles[pct + 1])
        wkg = f.data[in_bin]
        if len(wkg) > 0:
            res[in_bin] = pct + 1 + (wkg - quantiles[pct]) / (
                quantiles[pct + 1] - quantiles[pct]
            )
    # At or above the 99th percentile: extrapolate upward using the
    # 98th-99th percentile spacing. BUGFIX: use >= rather than > so a
    # value exactly equal to quantiles[98] is normalised (the original
    # left it untouched in res).
    above = f.data >= quantiles[98]
    wkg = f.data[above]
    if len(wkg) > 0:
        res[above] = 99 + (wkg - quantiles[98]) / (quantiles[98] - quantiles[97])
    r = f.copy()
    r.data = res / 100  # scale percentile position to ~0-1
    return r


def ERA5_roll_longitude(f):
    """Remap a cube from 0-360 longitudes to -180-180.

    NOTE(review): this pokes at iris's private _dim_coords_and_dims
    attribute (and assumes longitude is dimension 1) to relabel the
    >180 points - fragile across iris versions; confirm before
    upgrading iris.
    """
    eastern = f.extract(iris.Constraint(longitude=lambda cell: cell > 180))
    shifted = eastern._dim_coords_and_dims[1][0].points - 360
    eastern._dim_coords_and_dims[1][0].points = shifted
    western = f.extract(iris.Constraint(longitude=lambda cell: cell <= 180))
    return iris.cube.CubeList([eastern, western]).concatenate_cube()


def ERA5_trim(f, x=1440, y=720):
    """Trim a cube to the last x longitude points and first y latitude points.

    Defaults match the 0.25-degree ERA5 grid (presumably dropping the
    extra 721st latitude row - latitude points are assumed to be in
    descending order, TODO confirm).
    """
    lon_floor = f.coord("longitude").points[-x]
    lat_floor = f.coord("latitude").points[y - 1]
    keep_lon = iris.Constraint(longitude=lambda cell: cell >= lon_floor)
    keep_lat = iris.Constraint(latitude=lambda cell: cell >= lat_floor)
    return f.extract(keep_lon & keep_lat)


def ERA5_load_LS_mask():
    """Derive a land-sea mask from one day of the ERA5 SST file.

    Points with a plausible SST value (< 10000, i.e. sea) become 0;
    the remaining points (masked fill values, i.e. land) become 1 -
    assumes the netCDF fill value is large and positive (TODO confirm).
    """
    fname = (
        "%s/Proxy_20CR/datasets/ERA5/daily_SST/"
        + "era5_daily_0m_sea_surface_temperature_19790101to20210831.nc"
    ) % os.getenv("SCRATCH")
    first_day = iris.Constraint(
        time=lambda cell: cell.point.year == 1979
        and cell.point.month == 1
        and cell.point.day == 1
    )
    mask = iris.load_cube(fname, first_day)
    mask.data = mask.data.data  # strip the mask, exposing fill values
    mask.data[mask.data < 10000] = 0  # real SST values
    mask.data[mask.data > 0] = 1  # everything left is a fill value
    return mask


def ERA5_load_T2m(year, month, day):
    """Load one day of ERA5 2m-temperature from the local netCDF archive.

    Reads the 1979-2021 daily file under $SCRATCH and extracts the
    single time point matching (year, month, day).
    """
    fname = (
        "%s/Proxy_20CR/datasets/ERA5/daily_T2m/"
        + "era5_daily_2m_temperature_19790101to20210831.nc"
    ) % os.getenv("SCRATCH")
    wanted = iris.Constraint(
        time=lambda cell: cell.point.year == year
        and cell.point.month == month
        and cell.point.day == day
    )
    return iris.load_cube(fname, wanted)


def ERA5_load_T2m_climatology(year, month, day):
    """Load the pre-computed daily T2m climatology for (month, day).

    The year argument is unused (kept so the signature matches
    ERA5_load_T2m); Feb 29 is mapped onto the Feb 28 climatology.
    """
    if month == 2 and day == 29:
        day = 28  # no leap-day climatology file
    fname = "%s/Proxy_20CR/datasets/ERA5/daily_T2m/climatology/%02d/%02d.nc" % (
        os.getenv("SCRATCH"),
        month,
        day,
    )
    return iris.load_cube(fname)


def ERA5_load_T2m_variability_climatology(year, month, day):
    """Load the daily T2m variability climatology for (month, day).

    The year argument is unused (signature consistency with
    ERA5_load_T2m); Feb 29 is mapped onto the Feb 28 file.
    """
    if month == 2 and day == 29:
        day = 28  # no leap-day file
    fname = (
        "%s/Proxy_20CR/datasets/ERA5/daily_T2m/"
        + "variability_climatology/%02d/%02d.nc"
    ) % (os.getenv("SCRATCH"), month, day)
    return iris.load_cube(fname)

Scripts to make the climatology:

#!/usr/bin/env python

# Build the ERA5 T2m climatology for one calendar day:
# average the daily field over 1981-2010 and save as netCDF.

import os
import sys
import iris

sys.path.append("%s/.." % os.path.dirname(__file__))
from ERA5_load import ERA5_load_T2m

# Parallelism is handled externally (one process per day) -
# keep iris/dask on a single core.
import dask

dask.config.set(scheduler="single-threaded")

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--month", help="Integer month", type=int, required=True)
parser.add_argument("--day", help="Day of month", type=int, required=True)
args = parser.parse_args()

opfile = "%s/Proxy_20CR/datasets/ERA5/daily_T2m/climatology/%02d/%02d.nc" % (
    os.getenv("SCRATCH"),
    args.month,
    args.day,
)

if not os.path.isdir(os.path.dirname(opfile)):
    os.makedirs(os.path.dirname(opfile))

# Accumulate the 30-year sum, then divide to get the mean
total = None
for year in range(1981, 2011):
    daily = ERA5_load_T2m(year, args.month, args.day)
    if total is None:
        total = daily.copy()
    else:
        total += daily
total /= 30

iris.save(total, opfile)
#!/usr/bin/env python

# Build the list of make-climatology commands, one per calendar day.
# Nothing is executed here - the commands are written to 'run.txt'
# so they can be run in parallel by an external tool.

import os
import datetime


def is_done(month, day):
    # True if the climatology file for this (month, day) already exists
    target = ("%s/Proxy_20CR/datasets/ERA5/daily_T2m/climatology/%02d/%02d.nc") % (
        os.getenv("SCRATCH"),
        month,
        day,
    )
    return os.path.isfile(target)


task_list = open("run.txt", "w+")

# 1981 is an arbitrary non-leap year - we only need month/day pairs
day_ptr = datetime.date(1981, 1, 1)
last = datetime.date(1981, 12, 31)
one_day = datetime.timedelta(days=1)
while day_ptr <= last:
    if not is_done(day_ptr.month, day_ptr.day):
        task_list.write(
            "./make_climatology_day.py --month=%d --day=%d \n"
            % (day_ptr.month, day_ptr.day)
        )
    day_ptr += one_day

task_list.close()

Scripts to make the variability climatology:

#!/usr/bin/env python

# Build the ERA5 T2m variability climatology for one calendar day:
# the RMS deviation of the daily field about the 1981-2010
# climatological mean, saved as netCDF.

import os
import sys
import numpy as np
import iris
import iris.analysis.maths

sys.path.append("%s/.." % os.path.dirname(__file__))
from ERA5_load import ERA5_load_T2m
from ERA5_load import ERA5_load_T2m_climatology

# Parallelism is handled externally (one process per day) -
# keep iris/dask on a single core.
import dask

dask.config.set(scheduler="single-threaded")

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--month", help="Integer month", type=int, required=True)
parser.add_argument("--day", help="Day of month", type=int, required=True)
args = parser.parse_args()

opfile = (
    "%s/Proxy_20CR/datasets/ERA5/daily_T2m/variability_climatology/%02d/%02d.nc"
    % (
        os.getenv("SCRATCH"),
        args.month,
        args.day,
    )
)

if not os.path.isdir(os.path.dirname(opfile)):
    os.makedirs(os.path.dirname(opfile))

# Mean of squared anomalies over the 30 years, then square root
clim = ERA5_load_T2m_climatology(1981, args.month, args.day)
ssq = None
for year in range(1981, 2011):
    anom = ERA5_load_T2m(year, args.month, args.day) - clim
    anom = anom * anom
    if ssq is None:
        ssq = anom.copy()
    else:
        ssq += anom
ssq /= 30
result = iris.analysis.maths.apply_ufunc(np.sqrt, ssq)

iris.save(result, opfile)
#!/usr/bin/env python

# Make an ERA5 T2m variability climatology file for each day

# This script does not run the commands - it makes a list of commands
#  (in the file 'run.txt') which can be run in parallel.

import os
import datetime

# Function to check if the job is already done for this timepoint
# (i.e. the variability-climatology file for this month/day exists)
def is_done(month, day):
    op_file_name = (
        ("%s/Proxy_20CR/datasets/ERA5/daily_T2m/variability_climatology/%02d/%02d.nc")
    ) % (
        os.getenv("SCRATCH"),
        month,
        day,
    )
    if os.path.isfile(op_file_name):
        return True
    return False


f = open("run.txt", "w+")

# 1981 is an arbitrary non-leap year - only month/day pairs are used
start_day = datetime.date(1981, 1, 1)
end_day = datetime.date(1981, 12, 31)

current_day = start_day
while current_day <= end_day:
    if not is_done(
        current_day.month,
        current_day.day,
    ):
        # NOTE(review): is_done() checks for the *variability* climatology
        # output, but the command written here invokes
        # make_climatology_day.py, which makes the *plain* climatology -
        # so the variability files would never get produced by this list.
        # Looks like a copy-paste bug: presumably the command should run
        # the variability-climatology script instead - confirm its
        # filename and fix.
        cmd = ("./make_climatology_day.py --month=%d --day=%d \n") % (
            current_day.month,
            current_day.day,
        )
        f.write(cmd)
    current_day = current_day + datetime.timedelta(days=1)

f.close()