Making Tensors of pseudo-observations

To train a system that maps observations to analysis fields, we need to generate an appropriate set of observations (as a tf.Tensor) to pair with each analysis field. We’ll use pseudo-observations - fake observations made by extracting the values in the analysis field at each of a predefined set of observation locations - as this removes the complications of observation uncertainties and quality control. For this experiment, we are using a fixed observation coverage (the observation locations available at 1916-03-12:06).

We already have the analysis fields; we can make pseudo-observations from each field by extracting the values at the observation locations, and then export them as a serialised tensor:

#!/usr/bin/env python

# Read in a field from 20CR as an Iris cube.
# Extract the values at the locations of DWR stations
# Convert them into a TensorFlow tensor.
# Serialise that and store it on $SCRATCH.

import tensorflow as tf
tf.enable_eager_execution()
import numpy

import IRData.twcr as twcr
import iris
import datetime
import argparse
import os

# Get the obs locations for March 12, 1916
obs=twcr.load_observations_fortime(datetime.datetime(1916,3,12,6),
                                   version='2c')

parser = argparse.ArgumentParser()
parser.add_argument("--year", help="Year",
                    type=int,required=True)
parser.add_argument("--month", help="Integer month",
                    type=int,required=True)
parser.add_argument("--day", help="Day of month",
                    type=int,required=True)
parser.add_argument("--hour", help="Hour of day (0 to 23)",
                    type=int,required=True)
parser.add_argument("--member", help="Ensemble member",
                    default=1,type=int,required=False)
parser.add_argument("--source", help="Data source",
                    default='20CR2c',type=str,required=False)
parser.add_argument("--variable", help="variable name",
                    default='prmsl',type=str,required=False)
parser.add_argument("--test", help="test data, not training",
                    action="store_true")
parser.add_argument("--opfile", help="tf data file name",
                    default=None,
                    type=str,required=False)
args = parser.parse_args()
if args.opfile is None:
    purpose='training'
    if args.test: purpose='test'
    args.opfile=(("%s/Machine-Learning-experiments/datasets/"+
                  "obs_1916/%s/%s/%s/%04d-%02d-%02d:%02d.tfd") %
                       (os.getenv('SCRATCH'),args.source,args.variable,purpose,
                        args.year,args.month,args.day,args.hour))

if not os.path.isdir(os.path.dirname(args.opfile)):
    os.makedirs(os.path.dirname(args.opfile))

if args.source=='20CR2c':
    ic=twcr.load(args.variable,datetime.datetime(args.year,args.month,
                                                args.day,args.hour),
                 version='2c')
    ic=ic.extract(iris.Constraint(member=args.member))

    # Interpolate the field to each observation location to make
    #  the pseudo-observations
    ensemble=[]
    for index, row in obs.iterrows():
        ensemble.append(ic.interpolate(
                        [('latitude',row['Latitude']),
                         ('longitude',row['Longitude'])],
                        iris.analysis.Linear()).data.item())
    ensemble = numpy.array(ensemble, dtype=numpy.float32)
 
    # Normalise to mean=0, sd=1 (approx)
    if args.variable=='prmsl':
        ensemble -= 101325
        ensemble /= 3000
    elif args.variable=='air.2m':
        ensemble -= 280
        ensemble /= 50
    elif args.variable=='prate':
        pass
        # Don't normalise prate until later

else:
    raise ValueError('Source %s is not supported' % args.source)

# Convert to Tensor
ict=tf.convert_to_tensor(ensemble, numpy.float32)

# Write the serialised tensor to a file
sict=tf.serialize_tensor(ict)
tf.write_file(args.opfile,sict)
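
To check an output file, the serialised tensor can be read straight back in. A minimal sketch, assuming the same TensorFlow 1.x eager-execution setup (the file name is just an example - use any file produced by the script above):

#!/usr/bin/env python

# Read a serialised pseudo-observations tensor back in and inspect it.

import tensorflow as tf
tf.enable_eager_execution()
import numpy
import os

# Example file - any output of the script above will do
file_name=(("%s/Machine-Learning-experiments/datasets/obs_1916/"+
            "20CR2c/prmsl/training/1969-01-01:00.tfd") %
                 os.getenv('SCRATCH'))

sict=tf.read_file(file_name)             # serialised tensor
ict=tf.parse_tensor(sict,numpy.float32)  # 1d tensor: one value per obs location
print(ict.shape)
print(ict.numpy()[:5])                   # first few (normalised) pseudo-obs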

If we do that every 6 hours over an extended period, we build up a large batch of training data, and each target field can be matched with the obs tensor for the same time (a sketch of this pairing is given after the script below). We actually need two such datasets: a large one for training models, and a smaller one for testing them. The script below makes the list of commands needed to generate all the training and test data; these commands can then be run in parallel.

#!/usr/bin/env python

# Make a list of the commands needed to make the tf data files
#  of pseudo-observations for model training and testing.

# Get one data file every 6 hours over the selected years

# Partition off 1 year to be test data

import os
import datetime

# Function to check if the job is already done for this timepoint
def is_done(year,month,day,hour,group):
    op_file_name=("%s/Machine-Learning-experiments/datasets/" +
                  "obs_1916/20CR2c/prmsl/" +
                  "%s/%04d-%02d-%02d:%02d.tfd") % (
                            os.getenv('SCRATCH'),group,
                            year,month,day,hour)
    return os.path.isfile(op_file_name)

f=open("run.txt","w+")

start_day=datetime.datetime(1969,  1,  1,  0)
end_day  =datetime.datetime(2008, 12, 31, 23)

current_day=start_day
while current_day<=end_day:
    if not is_done(current_day.year,current_day.month,
                   current_day.day,current_day.hour,'training'):
        cmd=("./make_training_tensor.py --year=%d --month=%d" +
            " --day=%d --hour=%d --variable=prmsl \n") % (
               current_day.year,current_day.month,
               current_day.day,current_day.hour)
        f.write(cmd)
    current_day=current_day+datetime.timedelta(hours=6)
 
start_day=datetime.datetime(2009,  1,  1,  0)
end_day  =datetime.datetime(2009, 12, 31, 23)

current_day=start_day
while current_day<=end_day:
    if not is_done(current_day.year,current_day.month,
                   current_day.day,current_day.hour,'test'):
        cmd=("./make_training_tensor.py --year=%d --month=%d" +
            " --day=%d --hour=%d --variable=prmsl --test \n") % (
               current_day.year,current_day.month,
               current_day.day,current_day.hour)
        f.write(cmd)
    current_day=current_day+datetime.timedelta(hours=6)
   
f.close()
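
Once both the pseudo-observation tensors and the serialised analysis fields exist, a target field can be matched with its obs tensor simply by pairing files with the same date in their names. A minimal sketch of this pairing (the field-tensor directory here is an assumption - use whatever path the analysis fields were serialised to, and this assumes they were serialised in the same way as the obs tensors):

#!/usr/bin/env python

# Pair each pseudo-observations tensor with the analysis-field tensor
#  for the same time, and wrap the pairs in a tf.data Dataset.

import tensorflow as tf
tf.enable_eager_execution()
import numpy
import os
import glob

obs_dir=(("%s/Machine-Learning-experiments/datasets/obs_1916/"+
          "20CR2c/prmsl/training") % os.getenv('SCRATCH'))
# Assumed location of the serialised analysis fields
field_dir=(("%s/Machine-Learning-experiments/datasets/"+
            "20CR2c/prmsl/training") % os.getenv('SCRATCH'))

# Keep only the times for which both an obs tensor and a field tensor exist
obs_files=sorted(glob.glob("%s/*.tfd" % obs_dir))
pairs=[(of,"%s/%s" % (field_dir,os.path.basename(of)))
       for of in obs_files
       if os.path.isfile("%s/%s" % (field_dir,os.path.basename(of)))]

# Load one (obs, field) pair from their serialised-tensor files
def load_pair(obs_file,field_file):
    obs  =tf.parse_tensor(tf.read_file(obs_file),  numpy.float32)
    field=tf.parse_tensor(tf.read_file(field_file),numpy.float32)
    return obs,field

dataset=tf.data.Dataset.from_tensor_slices(
            ([p[0] for p in pairs],[p[1] for p in pairs]))
dataset=dataset.map(load_pair)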