Summarise records in CSV format
This script will search for a particular set of records, and print out a summary of each in CSV format, for easy viewing in a spreadsheet.
This won’t work well for arbitrary NARA records, but it should handle the ship logbooks.
#!/usr/bin/env python
# Make a csv file, that matches Kevin's spreadsheet, from the TNA Archive dump
import os
import sys
import json
import argparse
import re
from calendar import monthrange
# Command-line options selecting which catalogue records to summarise.
parser = argparse.ArgumentParser(
    description="Summarise catalogue records as CSV on standard output."
)
parser.add_argument("--rg", help="Record group", type=int, required=True)
parser.add_argument(
    "--subgroup", help="Record group file", type=int, required=False, default=None
)
parser.add_argument("--series", help="Series", type=int, required=True)
parser.add_argument("--match", help="Filter", type=str, required=False, default=None)
# --title previously had no help text, so it showed up undocumented in --help
parser.add_argument(
    "--title",
    help="Write a header row of column titles first",
    default=False,
    action="store_true",
)
args = parser.parse_args()
# Write to stdout
fw = sys.stdout
# Input files: either the single file named by --subgroup, or every file in
# the record group's directory.
scratch = os.getenv("SCRATCH")
if scratch is None:
    # Without this check the path would silently start with the text "None/"
    sys.exit("The SCRATCH environment variable is not set")
filed = "%s/WW2_US_logs/US_TNA_Catalog/record-groups/rg_%03d/" % (
    scratch,
    args.rg,
)
if args.subgroup is not None:
    fileN = [os.path.join(filed, "rg_%03d-%03d.json" % (args.rg, args.subgroup))]
else:
    # os.listdir returns names in arbitrary order; sort so that the order of
    # the output rows is reproducible from run to run.
    fileN = [os.path.join(filed, fn) for fn in sorted(os.listdir(filed))]
# Most of the work is in parsing the 'title' field, which is a string containing
# the ship name, maybe class and number ('DD-119'), maybe date or date range, and
# maybe other stuff, in several different formats.
# Lower-case English month names in calendar order, so that
# (index + 1) is the month number.
mnames = tuple(
    "january february march april may june "
    "july august september october november december".split()
)
def getNameFromTitle(title):
    """Reduce a record title to the ship name it contains.

    The title is trimmed in a fixed sequence of steps:

    1. cut at the first ':' and then at the first '(' (a marker in the
       very first position is left alone);
    2. cut at the first date-like text ('1/2/' or '- Month 1942/');
    3. drop everything up to and including ' of the ', then ' of '
       (this removes prefixes such as 'Log of the');
    4. rewrite 'U.S.S.' as 'USS', discarding anything before it.

    Parameters:
        title: the record's title string.

    Returns:
        The trimmed name. Trailing whitespace left by the cuts in steps
        1 and 2 is not removed.
    """
    shipName = title
    # Step 1: ':' and '(' introduce trailing detail (hull number etc.)
    for marker in (":", "("):
        pIdx = shipName.find(marker)
        if pIdx > 0:
            shipName = shipName[:pIdx]
    # Step 2: strip a date and everything after it. Raw strings keep the
    # regex escapes (\d, \w) from being read as string escapes.
    for pattern in (r"\d+/\d+/", r"- \w+ \d\d\d\d/"):
        rx = re.search(pattern, shipName)
        if rx is not None:
            shipName = shipName[: rx.start()]
    # Step 3: get rid of 'Log of the' and similar. The longer marker must be
    # tried first, and the shorter one is then applied to the result.
    for marker in (" of the ", " of "):
        pIdx = shipName.find(marker)
        if pIdx > 0:
            shipName = shipName[pIdx + len(marker) :]
    # Step 4: standardise on USS. The text after 'U.S.S.' normally starts
    # with a space, so strip it to avoid producing 'USS  Name'.
    pIdx = shipName.find("U.S.S.")
    if pIdx > -1:
        shipName = "USS %s" % shipName[pIdx + 6 :].lstrip()
    return shipName
def getClassFromTitle(title):
    """Return the bracketed class-and-number code (e.g. 'DD-119') in a title.

    The code must have the form '(<word characters>-<digits>)'; the value
    returned excludes the brackets. Only the first such code is used.

    Raises:
        Exception: with message "Class not found" if the title contains no
            such code.
    """
    # Raw string: \( \w \d are regex escapes, not string escapes
    rx = re.search(r"\((\w+-\d+)\)", title)
    if rx is None:
        raise Exception("Class not found")
    return rx.group(1)
def getStartDateFromTitle(title):
    """Extract the start date of a record from its title.

    Two title layouts are recognised, tried in this order:

    * a pair of slash-separated dates such as '1/2/1942 - 3/4/1942', read
      as month/day/year; the first date of the pair is returned;
    * a word and a four-digit year followed by a slash, such as
      '- March 1942/'; the first day of the month is returned. The month is
      the first one, in calendar order, whose English name occurs anywhere
      in the title.

    Returns:
        A list [year, month, day] of ints.

    Raises:
        Exception: with message "Date not found" if neither layout is
            present, or if the second layout is present but the title
            contains no month name.
    """
    rx = re.search(r"(\d+)/(\d+)/(\d+)\D+(\d+)/(\d+)/(\d+)", title)
    if rx is not None:
        month, day, year = (int(g) for g in rx.groups()[:3])
        return [year, month, day]
    rx = re.search(r"- \w+ (\d\d\d\d)/", title)
    if rx is not None:
        lowered = title.lower()
        for count, mname in enumerate(mnames, start=1):
            if mname in lowered:
                return [int(rx.group(1)), count, 1]
        # No month name found: fall through and raise, rather than
        # returning a meaningless month number of -1.
    raise Exception("Date not found")
def getEndDateFromTitle(title):
    """Extract the end date of a record from its title.

    Two title layouts are recognised, tried in this order:

    * a pair of slash-separated dates such as '1/2/1942 - 3/4/1942', read
      as month/day/year; the second date of the pair is returned;
    * a word and a four-digit year followed by a slash, such as
      '- March 1942/'; the last day of the month is returned. The month is
      the first one, in calendar order, whose English name occurs anywhere
      in the title.

    Returns:
        A list [year, month, day] of ints.

    Raises:
        Exception: with message "Date not found" if neither layout is
            present, or if the second layout is present but the title
            contains no month name.
    """
    rx = re.search(r"(\d+)/(\d+)/(\d+)\D+(\d+)/(\d+)/(\d+)", title)
    if rx is not None:
        month, day, year = (int(g) for g in rx.groups()[3:])
        return [year, month, day]
    rx = re.search(r"- \w+ (\d\d\d\d)/", title)
    if rx is not None:
        # monthrange needs an int year; passing the matched text itself
        # raises TypeError, which made this branch fail every time.
        year = int(rx.group(1))
        lowered = title.lower()
        for count, mname in enumerate(mnames, start=1):
            if mname in lowered:
                return [year, count, monthrange(year, count)[1]]
    raise Exception("Date not found")
def getEndDate(record):
    """Return the end date of a record as a (year, month, day) tuple.

    The record's coverageDates/coverageEndDate entry is used when it can be
    read: its 'logicalDate' text is sliced as YYYY-MM-DD if present,
    otherwise its 'year' and 'month' are used with the last day of that
    month. If anything goes wrong while reading it, the date is taken from
    the record's title instead via getEndDateFromTitle (which may raise).
    """
    try:
        end = record["coverageDates"]["coverageEndDate"]
        if "logicalDate" not in end:
            year = int(end["year"])
            month = int(end["month"])
            lastDay = monthrange(year, month)[1]
            return (year, month, lastDay)
        stamp = end["logicalDate"]
        return (int(stamp[:4]), int(stamp[5:7]), int(stamp[8:10]))
    except Exception:
        return getEndDateFromTitle(record["title"])
def getStartDate(record):
    """Return the start date of a record as a (year, month, day) tuple.

    The record's coverageDates/coverageStartDate entry is used when it can
    be read: its 'logicalDate' text is sliced as YYYY-MM-DD if present,
    otherwise its 'year' and 'month' are used with day 1. If anything goes
    wrong while reading it, the date is taken from the record's title
    instead via getStartDateFromTitle (which may raise).
    """
    try:
        start = record["coverageDates"]["coverageStartDate"]
        if "logicalDate" not in start:
            return (int(start["year"]), int(start["month"]), 1)
        stamp = start["logicalDate"]
        return (int(stamp[:4]), int(stamp[5:7]), int(stamp[8:10]))
    except Exception:
        return getStartDateFromTitle(record["title"])
# Optional header row: one title per CSV column, padded to the same widths
# as the header format below.
if args.title:
    columnTitles = (
        "Ship Name",
        "Hull No.",
        "Record Group",
        "Series NAID",
        "Record Entry",
        "Container",
        "StartDate",
        "EndDate",
        "Nara URL",
        "#Images",
        "Document URL",
    )
    headerFormat = (
        "%-30s,%-12s,%-12s,%-12s,%-12s,%-20s,%-10s,%-10s,%-50s,%-12s,%s\n"
    )
    fw.write(headerFormat % columnTitles)
# Commas would be read as field separators in the CSV output, so every
# comma in a value is swapped for a single space.
def stripC(inpS):
    """Return inpS with each ',' replaced by a space."""
    return inpS.replace(",", " ")
# Read every input file line by line and write one CSV row for each record
# that is a file unit belonging to the requested series.
for filen in fileN:
    # 'with' closes the file even if processing stops part-way through
    with open(filen, "r") as fd:
        for line in fd:
            # Lines that begin with a comma are skipped
            if line[0] == ",":
                continue
            # Cheap substring filter, applied before parsing the JSON
            if args.match is not None and args.match not in line:
                continue
            # Drop a leading '{[' and a trailing ']}' (just before the
            # newline) so the rest of the line can be parsed on its own
            if line[:2] == "{[":
                line = line[2:]
            if line[-3:-1] == "]}":
                line = line[:-3] + "\n"
            try:
                fj = json.loads(line)
            except ValueError:
                # Report the bad line on stderr (stdout carries the CSV) and
                # stop reading this file
                print(line, file=sys.stderr)
                break
            # Only file-unit records are summarised
            try:
                base = fj["description"]["fileUnit"]
            except Exception:
                continue
            # ... and only those whose parent series is the one requested
            try:
                if int(base["parentSeries"]["naId"]) != args.series:
                    continue
            except Exception:
                continue
            # Each column below is best-effort: if the value cannot be
            # extracted, a blank of the same width is written instead so the
            # row keeps its shape.
            # Ship name
            try:
                shipName = getNameFromTitle(base["title"])
                fw.write("%-30s," % stripC(shipName))
            except Exception:
                fw.write("%-30s," % " ")
            # Hull number (written inside double quotes)
            try:
                hullNo = getClassFromTitle(base["title"])
                fw.write('"%-12s",' % stripC(hullNo))
            except Exception:
                fw.write("%-12s," % " ")
            # Fixed data - record group and parent series
            fw.write("%-12d," % args.rg)
            fw.write("%-12d," % args.series)
            # Record entry? Taken from the first variant control number whose
            # type naId is "10675882". The value is worked out first and
            # written exactly once, so the column is still present when no
            # control number has that type.
            recordEntry = " "
            try:
                vcn = base["variantControlNumberArray"]["variantControlNumber"]
                for entry in vcn:
                    if entry["type"]["naId"] == "10675882":
                        recordEntry = stripC(entry["number"])
                        break
            except Exception:
                recordEntry = " "
            fw.write("%-12s," % recordEntry)
            # Container
            try:
                pFile = base["physicalOccurrenceArray"]["fileUnitPhysicalOccurrence"]
                cid = pFile["mediaOccurrenceArray"]["mediaOccurrence"]["containerId"]
                fw.write("%-20s," % stripC(cid))
            except Exception:
                fw.write("%-20s," % " ")
            # Dates, as YYYY-MM-DD
            try:
                ed = getStartDate(base)
                fw.write("%04d-%02d-%02d," % (ed[0], ed[1], ed[2]))
            except Exception:
                fw.write("%10s," % " ")
            try:
                ed = getEndDate(base)
                fw.write("%04d-%02d-%02d," % (ed[0], ed[1], ed[2]))
            except Exception:
                fw.write("%10s," % " ")
            # Nara URL
            try:
                fw.write("https://catalog.archives.gov/id/%-18s," % base["naId"])
            except Exception:
                fw.write("%-50s," % " ")
            # Count of images
            # NOTE(review): assumes exactly one of the objects is the pdf -
            # confirm against the data
            try:
                nImages = len(fj["objects"]["object"]) - 1  # Don't count the pdf
                fw.write("%-12d," % nImages)
            except Exception:
                fw.write("%-12s," % " ")
            # Get the pdf url, if there is one: the first object whose file
            # URL ends in 'pdf' (case-insensitive)
            try:
                docs = fj["objects"]["object"]
                pdfU = None
                for doc in docs:
                    a_url = doc["file"]["@url"]
                    if a_url[-3:].lower() == "pdf":
                        pdfU = a_url
                        break
                if pdfU is not None:
                    fw.write("%s" % pdfU)
                else:
                    fw.write("%12s" % " ")
            except Exception:
                fw.write("%12s" % " ")
            # End of this record's row
            fw.write("\n")