Making Tensors of pseudo-observations
To train a system that maps observations to analysis fields, we need to generate an appropriate set of observations (as a tf.Tensor) to pair with each analysis field. We’ll use pseudo-observations - fake observations made by extracting the values of the analysis field at each of a predefined set of observation locations - as this removes the complications of observation uncertainty and quality control. For this experiment, we are using a fixed observation coverage (the stations reporting at 1916-03-12:06).
We already have the analysis fields, and we can make pseudo-observations from each field by extracting the values at the observation locations; we then export these pseudo-observations as a serialised tensor:
#!/usr/bin/env python
# Read in a field from 20CR as an Iris cube.
# Extract the values at the locations of DWR stations
# Convert them into a TensorFlow tensor.
# Serialise that and store it on $SCRATCH.
import tensorflow as tf
tf.enable_eager_execution()
import numpy
import IRData.twcr as twcr
import iris
import datetime
import argparse
import os
# Get the obs locations for March 12, 1916 (the fixed coverage used throughout)
obs=twcr.load_observations_fortime(datetime.datetime(1916,3,12,6),
version='2c')
parser = argparse.ArgumentParser()
parser.add_argument("--year", help="Year",
type=int,required=True)
parser.add_argument("--month", help="Integer month",
type=int,required=True)
parser.add_argument("--day", help="Day of month",
type=int,required=True)
parser.add_argument("--hour", help="Hour of day (0 to 23)",
type=int,required=True)
parser.add_argument("--member", help="Ensemble member",
default=1,type=int,required=False)
parser.add_argument("--source", help="Data source",
default='20CR2c',type=str,required=False)
parser.add_argument("--variable", help="variable name",
default='prmsl',type=str,required=False)
parser.add_argument("--test", help="test data, not training",
action="store_true")
parser.add_argument("--opfile", help="tf data file name",
default=None,
type=str,required=False)
args = parser.parse_args()
if args.opfile is None:
purpose='training'
if args.test: purpose='test'
args.opfile=(("%s/Machine-Learning-experiments/datasets/"+
"obs_1916/%s/%s/%s/%04d-%02d-%02d:%02d.tfd") %
(os.getenv('SCRATCH'),args.source,args.variable,purpose,
args.year,args.month,args.day,args.hour))
if not os.path.isdir(os.path.dirname(args.opfile)):
os.makedirs(os.path.dirname(args.opfile))
if args.source=='20CR2c':
ic=twcr.load(args.variable,datetime.datetime(args.year,args.month,
args.day,args.hour),
version='2c')
ic=ic.extract(iris.Constraint(member=args.member))
ensemble=[]
for index, row in obs.iterrows():
ensemble.append(ic.interpolate(
[('latitude',row['Latitude']),
('longitude',row['Longitude'])],
iris.analysis.Linear()).data.item())
ensemble = numpy.array(ensemble, dtype=numpy.float32)
# Normalise to mean=0, sd=1 (approx)
if args.variable=='prmsl':
ensemble -= 101325
ensemble /= 3000
elif args.variable=='air.2m':
ensemble -= 280
ensemble /= 50
elif args.variable=='prate':
pass
# Don't normalise prate until later
else:
raise ValueError('Source %s is not supported' % args.source)
# Convert to Tensor
ict=tf.convert_to_tensor(ensemble, numpy.float32)
# Write the serialised tensor to a file on $SCRATCH
sict=tf.serialize_tensor(ict)
tf.write_file(args.opfile,sict)
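It’s worth checking that a stored file can be read back into a tensor. Here’s a minimal sketch of the inverse operation (the file name is just an example; tf.read_file and tf.parse_tensor are the TF 1.x deserialisation counterparts of the write calls above):

#!/usr/bin/env python
# Read back a serialised pseudo-observations tensor.
# (Sketch only - the file name is an example.)
import os
import tensorflow as tf
tf.enable_eager_execution()
file_name=(("%s/Machine-Learning-experiments/datasets/"+
            "obs_1916/20CR2c/prmsl/training/1969-01-01:00.tfd") %
           os.getenv('SCRATCH'))
sict=tf.read_file(file_name)          # serialised tensor, as a byte string
ict=tf.parse_tensor(sict,tf.float32)  # 1-d tensor, one value per station
print(ict.shape)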
If we do that every 6 hours over an extended period, we build up a large batch of training data in which each target field can be matched with an obs tensor. We actually need two such datasets: a large one for training models, and a smaller one for testing them. This script makes the list of commands needed to make all the training and test data; the commands are independent of one another, so they can be run in parallel (see the sketch after the script).
#!/usr/bin/env python
# Make a list of the commands needed to make the tf data files
# used for training and testing the model.
# Get one data file every 6 hours over the selected years.
# Partition off 1 year (2009) to be test data
import os
import datetime
# Function to check if the job is already done for this timepoint
def is_done(year,month,day,hour,group):
op_file_name=("%s/Machine-Learning-experiments/datasets/" +
"obs_1916/20CR2c/prmsl/" +
"%s/%04d-%02d-%02d:%02d.tfd") % (
os.getenv('SCRATCH'),group,
year,month,day,hour)
if os.path.isfile(op_file_name):
return True
return False
f=open("run.txt","w+")
start_day=datetime.datetime(1969, 1, 1, 0)
end_day =datetime.datetime(2008, 12, 31, 23)
current_day=start_day
while current_day<=end_day:
if not is_done(current_day.year,current_day.month,
current_day.day,current_day.hour,'training'):
cmd=("./make_training_tensor.py --year=%d --month=%d" +
" --day=%d --hour=%d --variable=prmsl \n") % (
current_day.year,current_day.month,
current_day.day,current_day.hour)
f.write(cmd)
current_day=current_day+datetime.timedelta(hours=6)
start_day=datetime.datetime(2009, 1, 1, 0)
end_day =datetime.datetime(2009, 12, 31, 23)
current_day=start_day
while current_day<=end_day:
if not is_done(current_day.year,current_day.month,
current_day.day,current_day.hour,'test'):
cmd=("./make_training_tensor.py --year=%d --month=%d" +
" --day=%d --hour=%d --variable=prmsl --test \n") % (
current_day.year,current_day.month,
current_day.day,current_day.hour)
f.write(cmd)
current_day=current_day+datetime.timedelta(hours=6)
f.close()
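run.txt is just a list of independent shell commands, so any parallel dispatcher will do (GNU parallel, or a batch scheduler). As one possible approach (a sketch only - the worker count of 8 is arbitrary), they can be run from Python with the standard library:

#!/usr/bin/env python
# Run the commands in run.txt in parallel.
# (Sketch only - 8 workers is an arbitrary choice; GNU parallel
#  or a batch scheduler would do the same job.)
import subprocess
import concurrent.futures
with open("run.txt") as f:
    cmds=[line.strip() for line in f if line.strip()]
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    for result in executor.map(
            lambda cmd: subprocess.run(cmd,shell=True), cmds):
        if result.returncode!=0:
            print("Failed: %s" % result.args)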