Assemble ERA5 normalized data into a set of tf.tensors

The data download scripts assemble selected ERA5 data in netCDF files. To use that data efficiently in analysis and modelling, it is necessary both to normalize it and to reformat it as a set of tf.tensors. These have a consistent format and resolution and can be reassembled into a `tf.data.Dataset` for ML model training.

Script to make the set of tensors. Takes the argument `--variable`, and uses precalculated normalization parameters:

#!/usr/bin/env python

# Make normalized tensors

import os
import sys
import argparse
import iris
import numpy as np
from shutil import rmtree
import zarr

# Suppress iris moaning
iris.FUTURE.save_split_attrs = True
iris.FUTURE.datum_support = True

# Suppress TensorFlow moaning about cuda - we don't need a GPU for this
# Also the warning message confuses people.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import tensorflow as tf
import tensorstore as ts
from normalize.ERA5.makeDataset import getDataset
from normalize.ERA5.normalize import match_normal, load_fitted


# Directory containing this script (not referenced again in the code below).
sDir = os.path.dirname(os.path.realpath(__file__))

# Command-line interface: a single required option, --variable, which selects
# both the raw input array and the normalization parameters to use.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--variable",
    help="Variable name",
    type=str,
    required=True,
)
args = parser.parse_args()

# Get the date range from the input zarr array
# The path is built from the SCRATCH environment variable; if SCRATCH is unset
# os.getenv returns None and the path will start with the literal text "None".
fn = "%s/DCVAE-Climate/raw_datasets/ERA5/%s_zarr" % (
    os.getenv("SCRATCH"),
    args.variable,
)
input_zarr = zarr.open(fn, mode="r")
# Used below as a mapping from a "YYYY-MM" string to an index along the
# third axis of the array.
AvailableMonths = input_zarr.attrs["AvailableMonths"]


# Create the output zarr array
fn = "%s/DCVAE-Climate/normalized_datasets/ERA5/%s_zarr" % (
    os.getenv("SCRATCH"),
    args.variable,
)
# Delete any previous version
# NOTE: this is destructive - the whole existing output directory is removed
# before the new array is created.
if os.path.exists(fn):
    rmtree(fn)

# The output has the same shape as the input, is stored as float32, and is
# filled with NaN until written. The chunk shape [721, 1440, 1] has length 1
# on the axis that is indexed per month when writing below.
normalized_zarr = ts.open(
    {
        "driver": "zarr",
        "kvstore": "file://" + fn,
    },
    dtype=ts.float32,
    chunk_layout=ts.ChunkLayout(chunk_shape=[721, 1440, 1]),
    create=True,
    fill_value=np.nan,
    shape=input_zarr.shape,
).result()
# Add date range to array as metadata
# TensorStore doesn't support metadata, so use the underlying zarr array
zarr_ds = zarr.open(fn, mode="r+")
zarr_ds.attrs["AvailableMonths"] = AvailableMonths

# Load the pre-calculated normalisation parameters
# fitted[month - 1] holds the data arrays of the three cubes returned by
# load_fitted for that calendar month, in the order they are returned.
fitted = []
for month in range(1, 13):
    cubes = load_fitted(month, variable=args.variable)
    fitted.append([cubes[0].data, cubes[1].data, cubes[2].data])


# Go through raw dataset and make normalized tensors
# batch(1) means each iteration yields a single month: batch[0] is the field
# and batch[1] the matching date string.
trainingData = getDataset(
    args.variable,
    cache=False,
    blur=1.0e-9,
).batch(1)

# Pending asynchronous writes; resolved after the loop.
op = []
for batch in trainingData:
    # Characters 0:4 and 5:7 of the date string are parsed as year and month
    # (presumably the string starts "YYYY-MM" - confirm against getDataset).
    year = int(batch[1].numpy()[0][0:4])
    month = int(batch[1].numpy()[0][5:7])

    # normalize
    # squeeze() drops the length-1 batch axis added by batch(1)
    raw = batch[0].numpy().squeeze()
    normalized = match_normal(raw, fitted[month - 1])
    ict = tf.convert_to_tensor(normalized, tf.float32)
    # Stop here rather than write non-finite values to the output array
    tf.debugging.check_numerics(ict, "Bad data %04d-%02d" % (year, month))

    # Index of this month along the third axis of the output array
    didx = AvailableMonths["%04d-%02d" % (year, month)]
    op.append(normalized_zarr[:, :, didx].write(ict))

# Ensure writes complete before exiting
for o in op:
    o.result()

Library functions to do the normalization:

# Functions to normalize a data distribution based on SPI
# The aim is to make a normalized distribution that is normally distributed
#  with mean=0.5 and sd=0.2 (so almost all the data is in 0-1)
from scipy.stats import gamma, norm
import numpy as np

import os
import sys
import iris


# Load the pre-calculated fitted values
def load_fitted(month, variable="total_precipitation"):
    """Load the three fitted-parameter cubes for one calendar month.

    The files are read from under the directory named by the SCRATCH
    environment variable.

    Args:
        month: Month number, formatted as two digits in the file name.
        variable: Variable name, used as a directory name in the path.

    Returns:
        Tuple ``(shape, location, scale)`` of cubes, in that order.
    """
    templates = (
        "%s/DCVAE-Climate/normalization/ERA5/%s/shape_m%02d.nc",
        "%s/DCVAE-Climate/normalization/ERA5/%s/location_m%02d.nc",
        "%s/DCVAE-Climate/normalization/ERA5/%s/scale_m%02d.nc",
    )
    scratch = os.getenv("SCRATCH")
    # Cubes are loaded in template order: shape, then location, then scale.
    return tuple(
        iris.load_cube(template % (scratch, variable, month))
        for template in templates
    )


# Find the normal variate that matches the gamma cdf
def match_normal(raw, gamma_p, norm_mean=0.5, norm_sd=0.2):
    """Map raw values onto a normal distribution via the fitted gamma cdf.

    Args:
        raw: Raw value(s); a scalar or an array.
        gamma_p: Indexable with three entries, passed positionally to
            ``gamma.cdf`` as (shape, location, scale).
        norm_mean: Mean of the target normal distribution.
        norm_sd: Standard deviation of the target normal distribution.

    Returns:
        The normal variate(s) whose cdf equals the gamma cdf of ``raw``;
        a scalar for scalar input, otherwise an array.
    """
    cdf = gamma.cdf(raw, gamma_p[0], gamma_p[1], gamma_p[2])
    # cdf=0 or 1 causes numerical failure (norm.ppf gives -inf/+inf), so keep
    # the probabilities strictly inside (0, 1).
    # np.clip is used rather than in-place masked assignment so that scalar
    # input (where gamma.cdf returns a NumPy scalar) also works.
    # Should fix the gamma fit so the lower clamp never happens.
    cdf = np.clip(cdf, 0.00001, 0.99999)
    spi = norm.ppf(cdf, loc=norm_mean, scale=norm_sd)
    return spi


# Find the original value from the normalized one
def match_original(normalized, gamma_p, norm_mean=0.5, norm_sd=0.2):
    """Invert the normalization: recover raw values from normalized ones.

    Args:
        normalized: Normalized value(s).
        gamma_p: Indexable with three entries, passed positionally to
            ``gamma.ppf`` as (shape, location, scale).
        norm_mean: Mean of the normal distribution used when normalizing.
        norm_sd: Standard deviation of that normal distribution.

    Returns:
        The gamma quantile(s) at the normal cdf of ``normalized``.
    """
    gamma_shape = gamma_p[0]
    gamma_location = gamma_p[1]
    gamma_scale = gamma_p[2]
    probability = norm.cdf(normalized, loc=norm_mean, scale=norm_sd)
    return gamma.ppf(probability, gamma_shape, gamma_location, gamma_scale)


# Normalise a cube (same as match_normal but for cubes)
def normalize_cube(raw, shape, location, scale, norm_mean=0.5, norm_sd=0.2):
    """Return a copy of ``raw`` with its data replaced by normalized values.

    Args:
        raw: Cube of raw values; its data must expose a ``mask``.
        shape: Cube of fitted gamma shape parameters; data must expose a
            ``mask``.
        location: Cube of fitted gamma location parameters.
        scale: Cube of fitted gamma scale parameters.
        norm_mean: Mean of the target normal distribution.
        norm_sd: Standard deviation of the target normal distribution.

    Returns:
        A copy of ``raw`` whose data is a masked array of normalized values,
        with the underlying values at masked points set to 0.0.
    """
    probability = gamma.cdf(
        raw.data,
        shape.data,
        loc=location.data,
        scale=scale.data,
    )
    # cdf=0 or 1 causes numerical failure, so pull the extremes inside (0, 1).
    # Most of the points clamped at the low end will be missing data.
    too_low = probability < 0.00001
    too_high = probability > 0.99999
    probability[too_high] = 0.99999
    probability[too_low] = 0.00001
    # NOTE(review): a point is masked only where BOTH raw and shape are
    # masked (logical AND) - confirm this is intended rather than OR.
    combined_mask = np.logical_and(raw.data.mask, shape.data.mask)
    normalized_values = norm.ppf(probability, loc=norm_mean, scale=norm_sd)
    result = raw.copy()
    result.data = np.ma.MaskedArray(normalized_values, combined_mask)
    # Put a definite value under the mask
    result.data.data[result.data.mask] = 0.0
    return result


# Convert a cube from normalized value to raw
#  (same as match_original but for cubes)
def unnormalize_cube(normalized, shape, location, scale, norm_mean=0.5, norm_sd=0.2):
    """Return a copy of ``normalized`` with its data converted back to raw.

    Args:
        normalized: Cube of normalized values.
        shape: Cube of fitted gamma shape parameters.
        location: Cube of fitted gamma location parameters.
        scale: Cube of fitted gamma scale parameters.
        norm_mean: Mean of the normal distribution used when normalizing.
        norm_sd: Standard deviation of that normal distribution.

    Returns:
        A copy of ``normalized`` whose data is a masked array holding the
        raw values, with the same mask as the input data (no mask if the
        input data is unmasked).
    """
    cdf = norm.cdf(normalized.data, loc=norm_mean, scale=norm_sd)
    raw = gamma.ppf(cdf, shape.data, location.data, scale.data)
    result = normalized.copy()
    # MaskedArray.data is a read-only property, so the underlying values
    # cannot be replaced by attribute assignment; build a new masked array
    # that carries over (a copy of) the input mask instead.
    result.data = np.ma.MaskedArray(
        raw, mask=np.ma.getmaskarray(normalized.data).copy()
    )
    return result

Library functions to convert between tf.Tensor and iris.cube.Cube:

# Utility functions for creating and manipulating normalized tensors

import tensorflow as tf
import numpy as np

from get_data.ERA5 import ERA5_monthly
from utilities import grids
from normalize.ERA5.normalize import (
    normalize_cube,
    unnormalize_cube,
    load_fitted,
)


# Load the data for 1 month
def load_raw(year, month, variable="total_precipitation"):
    """Load one month of ERA5 data on the E5sCube grid.

    Args:
        year: Year to load.
        month: Month to load.
        variable: Variable name passed to ``ERA5_monthly.load``.

    Returns:
        The loaded cube, with the underlying values at masked points set
        to 0.0 (the mask itself is left in place).
    """
    cube = ERA5_monthly.load(
        variable=variable, year=year, month=month, grid=grids.E5sCube
    )
    masked_points = cube.data.mask
    # Put a definite value under the mask
    cube.data.data[masked_points] = 0.0
    return cube


# Convert raw cube to normalized tensor
def raw_to_tensor(raw, variable, month):
    """Normalize a raw cube and return it as a float32 tensor.

    Args:
        raw: Cube of raw values; its data must expose a ``mask``.
        variable: Variable name used to find the fitted parameters.
        month: Calendar month used to find the fitted parameters.

    Returns:
        A ``tf.float32`` tensor of normalized values, with 0.0 wherever
        the raw data is masked.
    """
    fitted_shape, fitted_location, fitted_scale = load_fitted(
        month, variable=variable
    )
    normalized = normalize_cube(raw, fitted_shape, fitted_location, fitted_scale)
    # Zero every point that is masked in the raw data
    raw_masked = raw.data.mask
    normalized.data.data[raw_masked] = 0.0
    return tf.convert_to_tensor(normalized.data, tf.float32)


# Convert normalized tensor to cube
def tensor_to_cube(tensor):
    """Wrap a tensor's values in a copy of the E5sCube grid cube.

    Args:
        tensor: Tensor providing a ``numpy()`` method.

    Returns:
        A copy of ``grids.E5sCube`` whose data is a masked array of the
        tensor values, masked wherever the value is exactly 0.0.
    """
    result = grids.E5sCube.copy()
    result.data = tensor.numpy()
    # Exact zeros are treated as missing data
    zero_points = result.data == 0.0
    result.data = np.ma.MaskedArray(result.data, zero_points)
    return result


# Convert normalized tensor to raw values
def tensor_to_raw(tensor, variable, month):
    """Convert a normalized tensor back to a cube of raw values.

    Args:
        tensor: Normalized tensor.
        variable: Variable name used to find the fitted parameters.
        month: Calendar month used to find the fitted parameters.

    Returns:
        Cube of raw values, with the underlying values at masked points
        set to 0.0.
    """
    fitted_shape, fitted_location, fitted_scale = load_fitted(
        month, variable=variable
    )
    normalized_cube = tensor_to_cube(tensor)
    result = unnormalize_cube(
        normalized_cube, fitted_shape, fitted_location, fitted_scale
    )
    # Put a definite value under the mask
    masked_points = result.data.mask
    result.data.data[masked_points] = 0.0
    return result

Assemble ERA5 normalized data into a set of tf.tensors

Table of Contents

Get a copy

Found a bug, or have a suggestion?