Making Tensors from archived climate data

First we need to get the climate data from its archive, in its native format. For 20CR2c data this is simple using the IRData package:

#!/usr/bin/env python

# We're going to work with 20CR2c prmsl fields.
# Retrieve the netCDF files from NERSC and store on $SCRATCH

import datetime
import IRData.twcr as twcr

for year in range(1969,2011):
    # 2c is in 1 year batches so month and day don't matter
    dte=datetime.datetime(year,1,1)
    twcr.fetch('prmsl',dte,version='2c')
    twcr.fetch('air.2m',dte,version='2c')
    twcr.fetch('prate',dte,version='2c')
    twcr.fetch('z500',dte,version='2c')
    twcr.fetch('uwnd.10m',dte,version='2c')
    twcr.fetch('vwnd.10m',dte,version='2c')

We can then use IRData again to load a single field as an Iris cube. The cube.data attribute is a NumPy array that can be converted directly to a tensor, serialised, and written to disc:

#!/usr/bin/env python

# Read in a field from 20CR as an Iris cube.
# Convert it into a TensorFlow tensor.
# Serialise it and store it on $SCRATCH.

import tensorflow as tf
tf.enable_eager_execution()
import numpy

import IRData.twcr as twcr
import iris
import datetime
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument("--year", help="Year",
                    type=int,required=True)
parser.add_argument("--month", help="Integer month",
                    type=int,required=True)
parser.add_argument("--day", help="Day of month",
                    type=int,required=True)
parser.add_argument("--hour", help="Hour of day (0 to 23)",
                    type=int,required=True)
parser.add_argument("--member", help="Ensemble member",
                    default=1,type=int,required=False)
parser.add_argument("--source", help="Data source",
                    default='20CR2c',type=str,required=False)
parser.add_argument("--variable", help="variable name",
                    default='prmsl',type=str,required=False)
parser.add_argument("--test", help="test data, not training",
                    action="store_true")
parser.add_argument("--opfile", help="tf data file name",
                    default=None,
                    type=str,required=False)
args = parser.parse_args()
if args.opfile is None:
    purpose='training'
    if args.test: purpose='test'
    args.opfile=("%s/Machine-Learning-experiments/datasets/%s/%s/%s/%04d-%02d-%02d:%02d.tfd" %
                       (os.getenv('SCRATCH'),args.source,args.variable,purpose,
                        args.year,args.month,args.day,args.hour))

if not os.path.isdir(os.path.dirname(args.opfile)):
    os.makedirs(os.path.dirname(args.opfile))

if args.source=='20CR2c':
    ic=twcr.load(args.variable,datetime.datetime(args.year,args.month,
                                                args.day,args.hour),
                 version='2c')
    ic=ic.extract(iris.Constraint(member=args.member))
    # Normalise to mean=0, sd=1 (approx)
    if args.variable=='prmsl':
        ic.data -= 101325  # centre on standard atmospheric pressure (Pa)
        ic.data /= 3000    # scale to roughly unit variance
    elif args.variable=='air.2m':
        ic.data -= 280     # centre on a typical 2m temperature (K)
        ic.data /= 50      # scale to roughly unit variance
    elif args.variable=='prate':
        # Don't normalise prate until later
        pass
        #ic.data = ic.data+numpy.random.uniform(0,numpy.exp(-11),
        #                                           ic.data.shape)
        #ic.data = numpy.log(ic.data)
        #ic.data += 11.5
        #ic.data = numpy.maximum(ic.data,-7)
        #ic.data = numpy.minimum(ic.data,7)

else:
    raise ValueError('Source %s is not supported' % args.source)

# Convert to Tensor
ict=tf.convert_to_tensor(ic.data, numpy.float32)

# Write to tfrecord file
sict=tf.serialize_tensor(ict)
tf.write_file(args.opfile,sict)
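
To check that a file has been written correctly, the tensor can be read back with the matching deserialisation calls. A minimal sketch, using the same TF 1.x eager API as above (the file name is just an example):

#!/usr/bin/env python

# Read a serialised tensor back from $SCRATCH and check it.

import os
import tensorflow as tf
tf.enable_eager_execution()

# Example file name - any file made by the script above will do
file_name=("%s/Machine-Learning-experiments/datasets/20CR2c/prmsl/" +
           "training/1969-01-01:00.tfd") % os.getenv('SCRATCH')

sict=tf.read_file(file_name)          # serialised tensor, as a string
ict=tf.parse_tensor(sict,tf.float32)  # recover the original tensor
print(ict.shape)                      # should match the 20CR2c field grid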

If we do that every 5 days + 6 hours over an extended period, we make a large batch of training data, with fields far enough apart in time to be almost uncorrelated (for MSLP). Because the extra 6 hours steps successive samples through the four analysis times of day, the data are also evenly distributed through the diurnal and annual cycles. We actually need two such datasets: a large one for training models, and a smaller one for testing them. This script makes the list of commands needed to generate all the training and test data; the commands are independent, so they can be run in parallel (an example follows the script).

#!/usr/bin/env python

# Make a list of the commands needed to make a few hundred tf data files
#  for training the autoencoder.

# Get one data file every 5 days+6 hours over the selected years 
#  They should be far enough apart to be mostly independent.

# Partition off 1/10 of them to be test data

import os
import datetime

# Function to check if the job is already done for this timepoint
def is_done(year,month,day,hour,group):
    op_file_name=("%s/Machine-Learning-experiments/datasets/20CR2c/prmsl/" +
                  "%s/%04d-%02d-%02d:%02d.tfd") % (
                            os.getenv('SCRATCH'),group,
                            year,month,day,hour)
    return os.path.isfile(op_file_name)

f=open("run.txt","w+")

count=1

start_day=datetime.datetime(1969,  1,  1,  0)
end_day  =datetime.datetime(2009, 12, 31, 23)

current_day=start_day
while current_day<=end_day:
    if count%10==0:
        if not is_done(current_day.year,current_day.month,
                       current_day.day,current_day.hour,'test'):
            cmd=("./make_training_tensor.py --year=%d --month=%d" +
                " --day=%d --hour=%d --test \n") % (
                   current_day.year,current_day.month,
                   current_day.day,current_day.hour)
            f.write(cmd)
    else:
        if not is_done(current_day.year,current_day.month,
                       current_day.day,current_day.hour,'training'):
            cmd=("./make_training_tensor.py --year=%d --month=%d" +
                " --day=%d --hour=%d \n") % (
                   current_day.year,current_day.month,
                   current_day.day,current_day.hour)
            f.write(cmd)
    current_day=current_day+datetime.timedelta(hours=126) # 5 days + 6 hours
    count += 1
    
f.close()
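
The commands in run.txt are independent of each other, so they can be run in parallel with whatever tool is convenient. With GNU parallel, for example:

parallel --jobs 16 < run.txt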

As set up, this system makes tensors from 20CR2c MSLP fields, but it's simple to extend it to other sources and variables.
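
For example, to make a test-set tensor for 2m air temperature instead of MSLP, pass the appropriate flags to the script above:

./make_training_tensor.py --year=1969 --month=1 --day=1 --hour=0 --variable=air.2m --test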