Argentine Daily Weather Reports 1902: Conversion scripts

The original raw data needed to be converted into SEF (Station Exchange Format) files:

Convert the data for one station

#!/usr/bin/env python

# Make a SEF file for a single station from the raw data

import os
import sys
import pandas
import datetime
import copy
import SEF

import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--id", help="Station identifier",
                    type=str,required=True)
args = parser.parse_args()

# Find the directory containing this script
try:
    bindir=os.path.abspath(os.path.dirname(__file__))
except NameError:
    bindir='.'

# Get the station metadata (names and locations)
station_names=pandas.read_csv("%s/../raw_data/names.csv" % bindir,
                              skipinitialspace=True,quotechar="'",
                              encoding='utf-8')
if args.id not in station_names.SEF_ID.values:
    raise ValueError("Unrecognised station ID %s" % args.id)

# Get the known-bad stations
#known_bad=pandas.read_csv("%s/../raw_data/known_bad.csv" % bindir,
#                              skipinitialspace=True,quotechar="'",
#                              encoding='utf-8')

station_locations=pandas.read_csv("%s/../raw_data/Positions.csv" % bindir,
                              skipinitialspace=True,quotechar="'")
if args.id not in station_locations.SEF_ID.values:
    raise ValueError("Station %s has no location" % args.id)

# Load the raw data from Juerg's spreadsheet
try:
   original_name=station_names[station_names['SEF_ID']==args.id]['As-digitised'].values[0]
   assigned_number=int(station_names[station_names['SEF_ID']==args.id]['Number'].values[0])
   station_lat=station_locations[station_locations['SEF_ID']==args.id]['lat'].values[0]
   station_lon=station_locations[station_locations['SEF_ID']==args.id]['lon'].values[0]
   station_height=station_locations[station_locations['SEF_ID']==args.id]['height'].values[0]
except IndexError:
   raise Exception("Missing original name for %s" % args.id)
spreadsheet_file="%s/../raw_data/South_America_1902.%s.csv" % (bindir,original_name)
if not os.path.isfile(spreadsheet_file):
   raise Exception("Missing file %s" % spreadsheet_file)
raw_data=pandas.read_csv(spreadsheet_file)
n_values=len(raw_data)

# Make a SEF data structure and populate the common elements
# Argentine national time in 1902 was 4h17m behind UTC
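# so 1117 UTC corresponds to 07:00 local time, and 1817 UTC to 14:00 local time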
ob_time=[1117 if raw_data['MONTH'].values[i]>8 else 1817 for i in range(n_values)]
common=SEF.create(version='0.0.1')
common['ID']=args.id
common['Name']=original_name
common['Lat']=station_lat
common['Lon']=station_lon
common['Alt']=station_height
common['Source']=None
common['Repo']=None
common['Data']=pandas.DataFrame(
       {'Year'  : raw_data['YEAR'].values,
        'Month' : raw_data['MONTH'].values,
        'Day'   : raw_data['DAY'].values,
        'HHMM'  : ob_time})

# Where to put the output files
opdir="%s/../../../sef/Argentinian_DWR/1902" % bindir
#if args.id in known_bad.SEF_ID.values:
#   opdir="%s/../../../sef/Argentinian_DWR/known_bad/1902" % bindir
if not os.path.isdir(opdir):
    os.makedirs(opdir)

# MSLP
sef_v=copy.deepcopy(common)
sef_v['Var']='msl pressure'
sef_v['Units']='hPa'
sef_v['Meta']='PTC=T,PGC=?'
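# Raw pressures were digitised in mm of mercury; dividing by 0.750062 (mmHg per hPa) converts them to hPa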
raw_value=pandas.to_numeric(raw_data.iloc[:, 5],errors='coerce')
sef_v['Data']=pandas.concat([sef_v['Data'],
                            pandas.DataFrame(
                               {'TimeF' : [0] * n_values,   # Instantaneous
                                'Value' : (raw_value/0.75006156130264).tolist(),
                                'Meta'  : ''})],
                            axis=1,sort=False)
sef_v['Data']['Meta']=raw_value.map(lambda x: "Original=%5.1fmm" % x,
                                              na_action='ignore')
SEF.write_file(sef_v,
               "%s/%s_MSLP.tsv" % (opdir,args.id))


# Tair
sef_v=copy.deepcopy(common)
sef_v['Var']='temperature'
sef_v['Units']='K'
raw_value=pandas.to_numeric(raw_data.iloc[:, 7],errors='coerce')
sef_v['Data']=pandas.concat([sef_v['Data'],
                            pandas.DataFrame(
                               {'TimeF' : [0] * n_values,   # Instantaneous
                                'Value' : (raw_value+273.15).tolist(),
                                'Meta'  : ''})],
                            axis=1,sort=False)
sef_v['Data']['Meta']=raw_value.map(lambda x: "Original=%dC" % x,
                                              na_action='ignore')
SEF.write_file(sef_v,
               "%s/%s_T.tsv" % (opdir,args.id))

# Tmax
sef_v=copy.deepcopy(common)
sef_v['Var']='maximum temperature'
sef_v['Units']='K'
raw_value=pandas.to_numeric(raw_data.iloc[:, 9],errors='coerce')
sef_v['Data']=pandas.concat([sef_v['Data'],
                            pandas.DataFrame(
                               {'TimeF' : [13] * n_values,   # Max since last
                                'Value' : (raw_value+273.15).tolist(),
                                'Meta'  : ''})],
                            axis=1,sort=False)
sef_v['Data']['Meta']=raw_value.map(lambda x: "Original=%dC" % x,
                                              na_action='ignore')
SEF.write_file(sef_v,
               "%s/%s_Tmax.tsv" % (opdir,args.id))

# Tmin
sef_v=copy.deepcopy(common)
sef_v['Var']='minimum temperature'
sef_v['Units']='K'
raw_value=pandas.to_numeric(raw_data.iloc[:, 10],errors='coerce')
sef_v['Data']=pandas.concat([sef_v['Data'],
                            pandas.DataFrame(
                               {'TimeF' : [13] * n_values,   # Min since last
                                'Value' : (raw_value+273.15).tolist(),
                                'Meta'  : ''})],
                            axis=1,sort=False)
sef_v['Data']['Meta']=raw_value.map(lambda x: "Original=%dC" % x,
                                              na_action='ignore')
SEF.write_file(sef_v,
               "%s/%s_Tmin.tsv" % (opdir,args.id))

# RH
sef_v=copy.deepcopy(common)
sef_v['Var']='relative humidity'
sef_v['Units']='%'
raw_value=pandas.to_numeric(raw_data.iloc[:, 11],errors='coerce')
sef_v['Data']=pandas.concat([sef_v['Data'],
                            pandas.DataFrame(
                               {'TimeF' : [0] * n_values,   # Instantaneous
                                'Value' : (raw_value).tolist(),
                                'Meta'  : ''})],
                            axis=1,sort=False)
sef_v['Data']['Meta']=raw_value.map(lambda x: "Original=%d%%" % x,
                                              na_action='ignore')
SEF.write_file(sef_v,
               "%s/%s_RH.tsv" % (opdir,args.id))

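Each variable is written to its own SEF output file. A quick way to check a conversion is to read one of those files back and inspect it. The snippet below is a minimal sketch only: it assumes the SEF package also provides a read_file() counterpart to the write_file() used above, and <SEF_ID> is a placeholder for a real station identifier.

#!/usr/bin/env python

# Sanity check: read back one converted file and print its contents
# (sketch only: assumes SEF.read_file exists; <SEF_ID> is a placeholder ID)

import SEF

# Path is the output directory used by the conversion script above,
# relative to the scripts directory
obs = SEF.read_file("../../../sef/Argentinian_DWR/1902/<SEF_ID>_MSLP.tsv")
print(obs['Var'], obs['Units'])   # expected: 'msl pressure' and 'hPa'
print(obs['Data'].head())         # first few converted observations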

Run the script above for all the stations

#!/usr/bin/env python

# Make SEF files for all the Argentina 1902 stations

import os
import pandas
import subprocess

# Find the directory containing this script
try:
    bindir=os.path.abspath(os.path.dirname(__file__))
except NameError:
    bindir='.'

# Get the station metadata (names and locations)
station_names=pandas.read_csv("%s/../raw_data/names.csv" % bindir,
                              skipinitialspace=True,quotechar="'",
                              encoding='utf-8')

for id in station_names.SEF_ID.values:
    print(id)
    proc = subprocess.Popen("%s/convert_station.py --id=%s" % (bindir,id),
                            shell=True)
    (out, err) = proc.communicate()
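
Each station is converted in its own subprocess; proc.communicate() waits for that conversion to finish before the loop moves on, so the stations are processed one at a time rather than in parallel.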