Assemble ERA5 raw data into a set of tf.tensors
The data download scripts assemble selected ERA5 data in netCDF files. To use that data efficiently in analysis and modelling it is necessary to reformat it as a set of tf.tensors. These have a consistent format and resolution and can be reassembled into a tf.data.Dataset for ML model training.
So for each month in the training period, for each variable (2m_temperature, sea_surface_temperature, mean_sea_level_pressure, total_precipitation), we read in the data from netCDF, regrid it to a common grid, and save it as a tf.tensor.
The script make_all_raw_tensors.sh creates a set of commands to make all the tensors. The script outputs a list of other scripts (one per year, month, variable). Running all the output scripts will create the set of tensors. (Use GNU parallel to run the scripts efficiently - or submit them as jobs to a cluster).
#!/usr/bin/bash
# Make all the raw tensors
# Requires downloaded data
# Generate the tensor-making commands for each variable in turn. Every call
# runs in its own subshell, so the cd into ERA5 does not carry over to the
# next iteration.
for variable in \
    2m_temperature \
    sea_surface_temperature \
    mean_sea_level_pressure \
    total_precipitation
do
    (cd ERA5 && ./make_all_tensors.py --variable="${variable}")
done
Other scripts used by that main script:
Script to make the set of tensors for one variable. Takes argument --variable:
#!/usr/bin/env python
# Make raw data tensors for normalization
import os
import argparse
# Directory containing this script - the commands printed further down call
# make_training_tensor.py from the same directory.
sDir = os.path.dirname(os.path.realpath(__file__))

# The only command-line option is the (mandatory) variable name.
parser = argparse.ArgumentParser()
parser.add_argument("--variable", type=str, required=True, help="Variable name")
args = parser.parse_args()
def is_done(year, month, variable):
    """Report whether the tensor file for one month already exists.

    The file checked is
    $SCRATCH/DCVAE-Climate/raw_datasets/ERA5/<variable>/<YYYY>-<MM>.tfd

    Args:
        year: Integer year (zero-padded to four digits in the file name).
        month: Integer month (zero-padded to two digits in the file name).
        variable: Variable name, used as the sub-directory name.

    Returns:
        True if the file exists, False otherwise.
    """
    fn = "%s/DCVAE-Climate/raw_datasets/ERA5/%s/%04d-%02d.tfd" % (
        os.getenv("SCRATCH"),
        variable,
        year,
        month,
    )
    # os.path.exists already returns a bool - no need to branch on it.
    return os.path.exists(fn)
# Print one command for every month that does not yet have a tensor on disk.
# Nothing is run here: the commands are only written to stdout, to be
# executed separately. (The unused 'count' variable has been dropped.)
for year in range(1940, 2022):  # 1940 to 2021 inclusive
    for month in range(1, 13):
        if is_done(year, month, args.variable):
            continue  # Already made - no command needed
        cmd = "%s/make_training_tensor.py --year=%04d --month=%02d --variable=%s" % (
            sDir,
            year,
            month,
            args.variable,
        )
        print(cmd)
Calls another script to make a single tensor:
#!/usr/bin/env python
# Read in monthly variable from ERA5 - regrid to model resolution
# Convert into a TensorFlow tensor.
# Serialise and store on $SCRATCH.
import os
import sys
# Suppress TensorFlow moaning about cuda - we don't need a GPU for this
# Also the warning message confuses people.
# Set ahead of the tensorflow import that follows, so the value is already
# in the environment when TensorFlow is loaded.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
import tensorflow as tf
import dask
# Going to do external parallelism - run this on one core.
# TensorFlow has separate thread pools for running independent ops
# (inter-op) and for parallelism within a single op (intra-op); both are
# limited to one thread, and dask is made single-threaded as well.
tf.config.threading.set_inter_op_parallelism_threads(1)
tf.config.threading.set_intra_op_parallelism_threads(1)
dask.config.set(scheduler="single-threaded")
from tensor_utils import load_raw, raw_to_tensor
import argparse
# Command-line interface: year, month and variable are mandatory; the output
# file name is optional (args.opfile is None when it is not given).
parser = argparse.ArgumentParser()
parser.add_argument(
    "--year",
    type=int,
    required=True,
    help="Year",
)
parser.add_argument(
    "--month",
    type=int,
    required=True,
    help="Integer month",
)
parser.add_argument(
    "--variable",
    type=str,
    required=True,
    help="Variable name",
)
parser.add_argument(
    "--opfile",
    type=str,
    required=False,
    default=None,
    help="tf data file name",
)
args = parser.parse_args()
# Default output location:
# $SCRATCH/DCVAE-Climate/raw_datasets/ERA5/<variable>/<YYYY>-<MM>.tfd
if args.opfile is None:
    args.opfile = ("%s/DCVAE-Climate/raw_datasets/ERA5/%s/%04d-%02d.tfd") % (
        os.getenv("SCRATCH"),
        args.variable,
        args.year,
        args.month,
    )

# Make sure the output directory exists. Many copies of this script may run
# at the same time, so another process can create the directory between an
# existence check and the makedirs call - exist_ok=True makes that harmless.
# A bare file name has no directory part and os.makedirs("") raises, so
# there is nothing to create in that case.
opdir = os.path.dirname(args.opfile)
if opdir:
    os.makedirs(opdir, exist_ok=True)
# Load the requested month and convert it straight into a tensor
ict = raw_to_tensor(load_raw(args.year, args.month, variable=args.variable))

# Serialise the tensor and write it to the output file
tf.io.write_file(args.opfile, tf.io.serialize_tensor(ict))
Library functions to convert between tf.tensor and iris.cube.Cube:
# Utility functions for creating and manipulating raw tensors
import numpy as np
import tensorflow as tf
from get_data.ERA5 import ERA5_monthly
from utilities import grids
# Load the data for 1 month (on the standard cube).
def load_raw(year, month, member=None, variable="total_precipitation"):
    """Load one month of one variable, with masked points set to NaN.

    Args:
        year: Year to load.
        month: Month to load.
        member: Accepted but not used in this function.
        variable: Name of the variable to load.

    Returns:
        The object returned by ERA5_monthly.load for grid=grids.E5sCube
        (presumably an iris cube - TODO confirm), with the values at its
        masked points overwritten by NaN.
    """
    raw = ERA5_monthly.load(
        variable=variable,
        year=year,
        month=month,
        grid=grids.E5sCube,
    )
    field = raw.data
    # np.ma.getdata / np.ma.getmaskarray also accept a plain (unmasked)
    # ndarray, which has no .mask attribute, so a field with no missing data
    # is handled too. The underlying values are changed in place; the mask
    # itself is left as it is.
    np.ma.getdata(field)[np.ma.getmaskarray(field)] = np.nan
    return raw
# Convert raw cube to tensor
def raw_to_tensor(raw):
    """Convert the data of a raw cube into a float32 tensor.

    Args:
        raw: Object whose ``data`` attribute holds the values to convert.

    Returns:
        The result of tf.convert_to_tensor applied to ``raw.data`` with
        dtype tf.float32.
    """
    return tf.convert_to_tensor(raw.data, tf.float32)
# Convert tensor to cube
def tensor_to_cube(tensor):
    """Put the values of a tensor into a copy of the standard cube.

    Args:
        tensor: Tensor providing a ``numpy()`` method.

    Returns:
        A copy of grids.E5sCube whose data is a masked array of the tensor's
        values, masked wherever the value is NaN.
    """
    cube = grids.E5sCube.copy()
    values = tensor.numpy()
    # NaN marks missing data in the tensors, so turn it back into a mask
    cube.data = np.ma.MaskedArray(values, np.isnan(values))
    return cube