ERA5 to HadUK-Grid - convert the data into tf.Tensors

Script to make a tensor from a single day’s data:

#!/usr/bin/env python

# Read in a field of ERA5 Tmax as an Iris cube.
# Convert it into an anomaly
# Regrid it to match the HadUKGrid tensors
# Convert it into a TensorFlow tensor.
# Serialise it and store it on $SCRATCH.

import tensorflow as tf
import numpy as np

# Going to do external parallelism - run this on one core.
# TensorFlow keeps two separate thread pools: inter-op (independent ops run
#  concurrently) and intra-op (threads used inside a single op). Both have
#  to be limited, otherwise each of the many concurrent copies of this
#  script can still start a pool of intra-op threads.
tf.config.threading.set_inter_op_parallelism_threads(1)
tf.config.threading.set_intra_op_parallelism_threads(1)
import dask

# Same restriction for dask: no worker threads.
dask.config.set(scheduler="single-threaded")


import IRData.twcr as twcr
import iris
import datetime
import argparse
import os
import sys

# The loader modules are not installed packages: ERA5_load is expected next
#  to this script and HUKG_load_tmax in a sibling directory, so both
#  locations are added to the module search path.
# Resolve the script directory to an absolute path first - if the script is
#  started by bare file name, dirname(__file__) can be "" and the sibling
#  path would then wrongly start at the filesystem root ("/../...").
_script_dir = os.path.dirname(os.path.abspath(__file__))

sys.path.append(_script_dir)
from ERA5_load import ERA5_load_Tmax
from ERA5_load import ERA5_load_Tmax_climatology

sys.path.append(
    os.path.join(_script_dir, "..", "prepare_training_tensors_HUKG_Tmax")
)
from HUKG_load_tmax import HUKG_load_tmax
from HUKG_load_tmax import HUKG_trim

import argparse

# Command-line interface: the date to process, whether the result belongs
#  to the test or the training set, and (optionally) where to write it.
parser = argparse.ArgumentParser()
parser.add_argument("--year", help="Year", type=int, required=True)
parser.add_argument("--month", help="Integer month", type=int, required=True)
parser.add_argument("--day", help="Day of month", type=int, required=True)
parser.add_argument("--test", help="test data, not training", action="store_true")
parser.add_argument(
    "--opfile", help="tf data file name", default=None, type=str, required=False
)
args = parser.parse_args()

# Default output location:
#  $SCRATCH/Proxy_20CR/datasets/ERA5/daily_Tmax/<purpose>/YYYY-MM-DD.tfd
if args.opfile is None:
    scratch = os.getenv("SCRATCH")
    if scratch is None:
        # Without this check the file would silently go to a relative
        #  directory literally called "None".
        parser.error("$SCRATCH is not set - set it, or give --opfile explicitly")
    purpose = "test" if args.test else "training"
    args.opfile = "%s/Proxy_20CR/datasets/ERA5/daily_Tmax/%s/%04d-%02d-%02d.tfd" % (
        scratch,
        purpose,
        args.year,
        args.month,
        args.day,
    )

# Make sure the output directory exists.
# Many copies of this script run at the same time, so another process may
#  create the directory between a check and the mkdir - exist_ok=True makes
#  that harmless. An --opfile with no directory part needs no mkdir at all.
op_dir = os.path.dirname(args.opfile)
if op_dir:
    os.makedirs(op_dir, exist_ok=True)

# Anomaly = the day's Tmax field minus the climatology for the same date
tmax = ERA5_load_Tmax(args.year, args.month, args.day)
climatology = ERA5_load_Tmax_climatology(args.year, args.month, args.day)
anomaly = tmax - climatology
# Normalise: divide by 10 and shift by 0.5, so values lie in 0-1 (approx)
anomaly /= 10
anomaly += 0.5

# Interpolate (linear) onto the grid of the HadUKGrid field for this date
hukg_grid = HUKG_load_tmax(args.year, args.month, args.day)
regridded = anomaly.regrid(hukg_grid, iris.analysis.Linear())
# Trim with the same helper used for the HadUKGrid data - presumably drops
#  the bottom-left edge so the sizes can be halved repeatedly; see HUKG_trim
trimmed = HUKG_trim(regridded)

# Data array -> float32 tensor -> serialised string -> output file
tensor = tf.convert_to_tensor(trimmed.data, np.float32)
serialised = tf.io.serialize_tensor(tensor)
tf.io.write_file(args.opfile, serialised)

Script to make a tensor for every day from 1979-01-01 to 2020-08-31 (it writes a list of commands, each of which runs the above script for one day):

#!/usr/bin/env python

# Make a few thousand tf data files
#  for training the VAE models.

# Get one data file for every day

# Partition off 1/10 of them to be test data

# This script does not run the commands - it makes a list of commands
#  (in the file 'run.txt') which can be run in parallel.

import os
import datetime

def is_done(year, month, day, group):
    """Report whether the tensor file for this date has already been made.

    The file looked for is
    $SCRATCH/Proxy_20CR/datasets/ERA5/daily_Tmax/<group>/YYYY-MM-DD.tfd

    Args:
        year: Year of the timepoint (integer).
        month: Month of the timepoint (integer).
        day: Day of month of the timepoint (integer).
        group: Sub-directory name - this script passes "test" or "training".

    Returns:
        True if the file exists, False otherwise.
    """
    scratch_dir = os.getenv("SCRATCH")
    tensor_path = "%s/Proxy_20CR/datasets/ERA5/daily_Tmax/%s/%04d-%02d-%02d.tfd" % (
        scratch_dir,
        group,
        year,
        month,
        day,
    )
    return os.path.isfile(tensor_path)


# Period to cover (both ends inclusive)
start_day = datetime.date(1979, 1, 1)
end_day = datetime.date(2020, 8, 31)

# Write one command line to run.txt for every day whose output file is
#  still missing. The 'with' block guarantees the file is flushed and
#  closed even if is_done() or a write fails part-way through.
with open("run.txt", "w") as f:
    current_day = start_day
    count = 1
    while current_day <= end_day:
        # One day in ten goes to the test set, the rest are training data.
        # The offset 8 is chosen to match with hadUKgrid test cases.
        if count % 10 == 8:
            group, flag = "test", "--test "
        else:
            group, flag = "training", ""
        if not is_done(
            current_day.year,
            current_day.month,
            current_day.day,
            group,
        ):
            # flag is either "--test " or empty, so each line keeps the
            #  single space before the newline in both cases.
            cmd = "./make_training_tensor.py --year=%d --month=%d --day=%d %s\n" % (
                current_day.year,
                current_day.month,
                current_day.day,
                flag,
            )
            f.write(cmd)
        current_day = current_day + datetime.timedelta(days=1)
        count += 1