Full page extraction
Enough tests - let’s try to extract all the data from a page: the station metadata, the daily rainfall observations, and the monthly totals.
The code is a straightforward extension of the earlier tests - adding the required structure for each data section. One complication is that extracting everything at once runs into Gemini’s limit on the maximum size of its output. So I’m doing it in four steps - metadata, daily obs Jan-Jun, daily obs Jul-Dec, and monthly totals.
#!/usr/bin/env python3
# Get all required data from the page
import os
import PIL.Image
import google.generativeai as genai
import typing_extensions as typing
# You will need an API key - get one from https://ai.google.dev/gemini-api/docs/api-key
# I keep my API key in the .gemini_api file in my home directory.
# The home directory is resolved with expanduser("~") rather than
# os.getenv("HOME"): when HOME is unset, getenv returns None and the path
# would silently become "None/.gemini_api".
api_key_path = os.path.join(os.path.expanduser("~"), ".gemini_api")
with open(api_key_path, "r", encoding="utf-8") as file:
    # strip() removes the trailing newline most editors add to the file
    api_key = file.read().strip()
# Default protocol is 'GRPC' - but that is blocked by the Office firewall.
# Use 'REST' instead.
genai.configure(api_key=api_key, transport="rest")
# Specify a structure for the station metadata
# Passed as `response_schema` for the metadata request, so the JSON reply
# carries exactly these keys.
class MetaData(typing.TypedDict):
    Year: int  # year the page covers
    StationNumber: int  # station number
    Location: str  # station location
    County: str  # county
    Sea_level_height: int  # height above sea level - units not stated here; presumably feet, confirm against the page
    Gauge_diameter: int  # rain gauge diameter - units not stated here; presumably inches, confirm against the page
    # Gauge height is requested as two separate integers: feet and inches.
    Gauge_height_feet: int
    Gauge_height_inches: int
# Specify a structure for the daily observations
# One entry per day of a month.
class Daily(typing.TypedDict):
    Day: int  # day of the month
    # Rainfall is kept as text, not a number, so that the '-' marker the
    # prompts ask for on missing days can be returned.
    rainfall: str
# All the daily observations for one month.
class Monthly(typing.TypedDict):
    Month: str  # month name as returned by the model
    rainfall: list[Daily]  # one Daily entry per day
# Top-level schema for a daily-observations request: a list of months.
# NOTE: the key is called "Month" even though it holds a list of Monthly
# entries; the name is part of the JSON output, so it is left unchanged.
class Annual(typing.TypedDict):
    Month: list[Monthly]
# Schema for the monthly totals: a flat list of strings.
class Totals(typing.TypedDict):
    Totals: list[str]
# Open the scanned page that serves as the sample image
img = PIL.Image.open(
    "../../images/jpgs_300dpi/"
    "Devon_1941-1950_RainNos_1651-1689-293.jpg"
)

# Choose the Gemini model - this one was the latest as of 2025-01-29
model = genai.GenerativeModel("gemini-2.0-flash-exp")
# Get metadata from the image
# The MetaData schema plus the JSON mime type ask for a structured reply.
result = model.generate_content(
    [
        img,
        "\n\n",
        "List the station metadata",
    ],
    generation_config=genai.GenerationConfig(
        response_mime_type="application/json", response_schema=MetaData
    ),
)
# Structured data as JSON
# Written as UTF-8 explicitly so the output does not depend on the
# platform's default encoding.
with open("metadata.json", "w", encoding="utf-8") as file:
    file.write(result.text)
# Also keep the string form of the whole response object
with open("metadata.txt", "w", encoding="utf-8") as file:
    file.write(str(result))
# Get the daily observations from the image
# In two batches because of output size limits
# First batch: January to June
result = model.generate_content(
    [
        img,
        "\n\n",
        "List the daily observations for months January to June. "
        + "Be careful of missing data. Several days have missing data and "
        + "These days will have an entry that is blank or has a dash '-'. "
        + "Return the character '-' for missing data.",
    ],
    generation_config=genai.GenerationConfig(
        response_mime_type="application/json", response_schema=Annual
    ),
)
# Written as UTF-8 explicitly so the output does not depend on the
# platform's default encoding.
with open("daily1.json", "w", encoding="utf-8") as file:
    file.write(result.text)
# Also keep the string form of the whole response object
with open("daily1.txt", "w", encoding="utf-8") as file:
    file.write(str(result))
# Second batch of daily observations: July to December
result = model.generate_content(
    [
        img,
        "\n\n",
        "List the daily observations for months July to December. "
        + "Be careful of missing data. Several days have missing data and "
        + "These days will have an entry that is blank or has a dash '-'. "
        + "Return the character '-' for missing data.",
    ],
    generation_config=genai.GenerationConfig(
        response_mime_type="application/json", response_schema=Annual
    ),
)
# Written as UTF-8 explicitly so the output does not depend on the
# platform's default encoding.
with open("daily2.json", "w", encoding="utf-8") as file:
    file.write(result.text)
# Also keep the string form of the whole response object
with open("daily2.txt", "w", encoding="utf-8") as file:
    file.write(str(result))
# Get the Monthly totals
# The Totals schema asks for a flat list of strings.
result = model.generate_content(
    [img, "\n\n", "List the monthly totals for each month."],
    generation_config=genai.GenerationConfig(
        response_mime_type="application/json", response_schema=Totals
    ),
)
# Written as UTF-8 explicitly so the output does not depend on the
# platform's default encoding.
with open("totals.json", "w", encoding="utf-8") as file:
    file.write(result.text)
# Also keep the string form of the whole response object
with open("totals.txt", "w", encoding="utf-8") as file:
    file.write(str(result))
This produces four JSON output files (metadata.json, daily1.json, daily2.json and totals.json):
{
"County": "DEVON",
"Gauge_diameter": 5,
"Gauge_height_feet": 1,
"Gauge_height_inches": 3,
"Location": "BADWORTHY COTTAGE, S. BRENT",
"Sea_level_height": 550,
"StationNumber": 1678,
"Year": 1947
}
{
"Month":[
{
"Month":"Jan",
"rainfall":[
{"Day":1,"rainfall":"0.54"},
{"Day":2,"rainfall":"0.31"},
{"Day":3,"rainfall":"0.40"},
{"Day":4,"rainfall":"-"},
{"Day":5,"rainfall":"1.03"},
{"Day":6,"rainfall":"0.43"},
{"Day":7,"rainfall":"0.04"},
{"Day":8,"rainfall":"0.62"},
{"Day":9,"rainfall":"0.06"},
{"Day":10,"rainfall":"1.24"},
{"Day":11,"rainfall":"0.49"},
{"Day":12,"rainfall":"0.33"},
{"Day":13,"rainfall":"1.05"},
{"Day":14,"rainfall":"0.11"},
{"Day":15,"rainfall":"-"},
{"Day":16,"rainfall":"0.15"},
{"Day":17,"rainfall":"0.05"},
{"Day":18,"rainfall":"-"},
{"Day":19,"rainfall":"-"},
{"Day":20,"rainfall":"-"},
{"Day":21,"rainfall":"-"},
{"Day":22,"rainfall":"-"},
{"Day":23,"rainfall":"-"},
{"Day":24,"rainfall":"-"},
{"Day":25,"rainfall":"-"},
{"Day":26,"rainfall":"-"},
{"Day":27,"rainfall":"-"},
{"Day":28,"rainfall":"0.02"},
{"Day":29,"rainfall":"-"},
{"Day":30,"rainfall":"0.63"},
{"Day":31,"rainfall":"-"}
]
},
{
"Month":"Feb",
"rainfall":[
{"Day":1,"rainfall":"1.19"},
{"Day":2,"rainfall":"0.42"},
{"Day":3,"rainfall":"0.67"},
{"Day":4,"rainfall":"0.11"},
{"Day":5,"rainfall":"-"},
{"Day":6,"rainfall":"-"},
{"Day":7,"rainfall":"0.29"},
{"Day":8,"rainfall":"-"},
{"Day":9,"rainfall":"0.17"},
{"Day":10,"rainfall":"-"},
{"Day":11,"rainfall":"0.01"},
{"Day":12,"rainfall":"-"},
{"Day":13,"rainfall":"-"},
{"Day":14,"rainfall":"-"},
{"Day":15,"rainfall":"1.74"},
{"Day":16,"rainfall":"0.12"},
{"Day":17,"rainfall":"0.90"},
{"Day":18,"rainfall":"0.40"},
{"Day":19,"rainfall":"0.36"},
{"Day":20,"rainfall":"0.65"},
{"Day":21,"rainfall":"0.17"},
{"Day":22,"rainfall":"0.55"},
{"Day":23,"rainfall":"0.39"},
{"Day":24,"rainfall":"0.05"},
{"Day":25,"rainfall":"1.00"},
{"Day":26,"rainfall":"-"},
{"Day":27,"rainfall":"-"},
{"Day":28,"rainfall":"0.22"},
{"Day":29,"rainfall":"-"},
{"Day":30,"rainfall":"0.32"},
{"Day":31,"rainfall":"-"}
]
},
{
"Month":"Mar",
"rainfall":[
{"Day":1,"rainfall":"-"},
{"Day":2,"rainfall":"-"},
{"Day":3,"rainfall":"-"},
{"Day":4,"rainfall":"0.03"},
{"Day":5,"rainfall":"0.79"},
{"Day":6,"rainfall":"-"},
{"Day":7,"rainfall":"0.15"},
{"Day":8,"rainfall":"-"},
{"Day":9,"rainfall":"1.23"},
{"Day":10,"rainfall":"1.08"},
{"Day":11,"rainfall":"0.28"},
{"Day":12,"rainfall":"2.28"},
{"Day":13,"rainfall":"0.35"},
{"Day":14,"rainfall":"0.17"},
{"Day":15,"rainfall":"-"},
{"Day":16,"rainfall":"-"},
{"Day":17,"rainfall":"-"},
{"Day":18,"rainfall":"-"},
{"Day":19,"rainfall":"0.12"},
{"Day":20,"rainfall":"1.01"},
{"Day":21,"rainfall":"0.16"},
{"Day":22,"rainfall":"0.18"},
{"Day":23,"rainfall":"0.91"},
{"Day":24,"rainfall":"0.01"},
{"Day":25,"rainfall":"-"},
{"Day":26,"rainfall":"-"},
{"Day":27,"rainfall":"0.45"},
{"Day":28,"rainfall":"-"},
{"Day":29,"rainfall":"0.11"},
{"Day":30,"rainfall":"0.47"},
{"Day":31,"rainfall":"0.04"}
]
},
{
"Month":"Apr",
"rainfall":[
{"Day":1,"rainfall":"0.05"},
{"Day":2,"rainfall":"0.01"},
{"Day":3,"rainfall":"0.56"},
{"Day":4,"rainfall":"0.10"},
{"Day":5,"rainfall":"1.60"},
{"Day":6,"rainfall":"0.34"},
{"Day":7,"rainfall":"0.32"},
{"Day":8,"rainfall":"0.15"},
{"Day":9,"rainfall":"-"},
{"Day":10,"rainfall":"-"},
{"Day":11,"rainfall":"-"},
{"Day":12,"rainfall":"-"},
{"Day":13,"rainfall":"-"},
{"Day":14,"rainfall":"0.05"},
{"Day":15,"rainfall":"0.06"},
{"Day":16,"rainfall":"-"},
{"Day":17,"rainfall":"0.74"},
{"Day":18,"rainfall":"-"},
{"Day":19,"rainfall":"-"},
{"Day":20,"rainfall":"-"},
{"Day":21,"rainfall":"-"},
{"Day":22,"rainfall":"-"},
{"Day":23,"rainfall":"-"},
{"Day":24,"rainfall":"0.52"},
{"Day":25,"rainfall":"0.05"},
{"Day":26,"rainfall":"-"},
{"Day":27,"rainfall":"-"},
{"Day":28,"rainfall":"-"},
{"Day":29,"rainfall":"-"},
{"Day":30,"rainfall":"0.37"},
{"Day":31,"rainfall":"-"}
]
},
{
"Month":"May",
"rainfall":[
{"Day":1,"rainfall":"0.14"},
{"Day":2,"rainfall":"0.45"},
{"Day":3,"rainfall":"0.02"},
{"Day":4,"rainfall":"0.11"},
{"Day":5,"rainfall":"0.08"},
{"Day":6,"rainfall":"0.12"},
{"Day":7,"rainfall":"0.01"},
{"Day":8,"rainfall":"-"},
{"Day":9,"rainfall":"0.14"},
{"Day":10,"rainfall":"-"},
{"Day":11,"rainfall":"-"},
{"Day":12,"rainfall":"0.23"},
{"Day":13,"rainfall":"-"},
{"Day":14,"rainfall":"0.52"},
{"Day":15,"rainfall":"-"},
{"Day":16,"rainfall":"-"},
{"Day":17,"rainfall":"0.05"},
{"Day":18,"rainfall":"0.04"},
{"Day":19,"rainfall":"-"},
{"Day":20,"rainfall":"-"},
{"Day":21,"rainfall":"-"},
{"Day":22,"rainfall":"-"},
{"Day":23,"rainfall":"0.02"},
{"Day":24,"rainfall":"-"},
{"Day":25,"rainfall":"-"},
{"Day":26,"rainfall":"-"},
{"Day":27,"rainfall":"-"},
{"Day":28,"rainfall":"-"},
{"Day":29,"rainfall":"-"},
{"Day":30,"rainfall":"-"},
{"Day":31,"rainfall":"-"}
]
},
{
"Month":"June",
"rainfall":[
{"Day":1,"rainfall":"-"},
{"Day":2,"rainfall":"-"},
{"Day":3,"rainfall":"0.42"},
{"Day":4,"rainfall":"0.39"},
{"Day":5,"rainfall":"1.10"},
{"Day":6,"rainfall":"0.22"},
{"Day":7,"rainfall":"0.02"},
{"Day":8,"rainfall":"0.58"},
{"Day":9,"rainfall":"0.52"},
{"Day":10,"rainfall":"0.37"},
{"Day":11,"rainfall":"-"},
{"Day":12,"rainfall":"0.11"},
{"Day":13,"rainfall":"-"},
{"Day":14,"rainfall":"-"},
{"Day":15,"rainfall":"-"},
{"Day":16,"rainfall":"-"},
{"Day":17,"rainfall":"-"},
{"Day":18,"rainfall":"-"},
{"Day":19,"rainfall":"-"},
{"Day":20,"rainfall":"1.50"},
{"Day":21,"rainfall":"-"},
{"Day":22,"rainfall":"-"},
{"Day":23,"rainfall":"-"},
{"Day":24,"rainfall":"-"},
{"Day":25,"rainfall":"0.03"},
{"Day":26,"rainfall":"-"},
{"Day":27,"rainfall":"0.51"},
{"Day":28,"rainfall":"0.56"},
{"Day":29,"rainfall":"-"},
{"Day":30,"rainfall":"0.08"},
{"Day":31,"rainfall":"-"}
]
}
]
}
{
"Month": [
{
"Month": "July",
"rainfall": [
{ "Day": 1, "rainfall": "-"},
{ "Day": 2, "rainfall": "-"},
{ "Day": 3, "rainfall": "-"},
{ "Day": 4, "rainfall": "-"},
{ "Day": 5, "rainfall": "-"},
{ "Day": 6, "rainfall": "-"},
{ "Day": 7, "rainfall": "-"},
{ "Day": 8, "rainfall": "-"},
{ "Day": 9, "rainfall": "-"},
{ "Day": 10, "rainfall":"-"},
{ "Day": 11, "rainfall": "-"},
{ "Day": 12, "rainfall": "-"},
{ "Day": 13, "rainfall": "-"},
{ "Day": 14, "rainfall": "-"},
{ "Day": 15, "rainfall": "-"},
{ "Day": 16, "rainfall":"-"},
{ "Day": 17, "rainfall": "-"},
{ "Day": 18, "rainfall": "-"},
{ "Day": 19, "rainfall": "-"},
{ "Day": 20, "rainfall": "-"},
{ "Day": 21, "rainfall":"-"},
{ "Day": 22, "rainfall": "-"},
{ "Day": 23, "rainfall": "-"},
{ "Day": 24, "rainfall": "-"},
{ "Day": 25, "rainfall": "-"},
{ "Day": 26, "rainfall": "-"},
{ "Day": 27, "rainfall": "-"},
{ "Day": 28, "rainfall": "-"},
{ "Day": 29, "rainfall": "-"},
{ "Day": 30, "rainfall": "-"},
{ "Day": 31, "rainfall": "-"}
]
},
{
"Month": "August",
"rainfall": [
{ "Day": 1, "rainfall": "08"},
{ "Day": 2, "rainfall": "06"},
{ "Day": 3, "rainfall":"1.59"},
{ "Day": 4, "rainfall": "39"},
{ "Day": 5, "rainfall": "-"},
{ "Day": 6, "rainfall": "22"},
{ "Day": 7, "rainfall": "64"},
{ "Day": 8, "rainfall":"58"},
{ "Day": 9, "rainfall": "52"},
{ "Day": 10, "rainfall":"37"},
{ "Day": 11, "rainfall": "-"},
{ "Day": 12, "rainfall":"11"},
{ "Day": 13, "rainfall":"03"},
{ "Day": 14, "rainfall": "-"},
{ "Day": 15, "rainfall": "-"},
{ "Day": 16, "rainfall": "-"},
{ "Day": 17, "rainfall": "-"},
{ "Day": 18, "rainfall": "-"},
{ "Day": 19, "rainfall": "-"},
{ "Day": 20, "rainfall": "1.50"},
{ "Day": 21, "rainfall":"75"},
{ "Day": 22, "rainfall": "-"},
{ "Day": 23, "rainfall":"-"},
{ "Day": 24, "rainfall": "-"},
{ "Day": 25, "rainfall": "03"},
{ "Day": 26, "rainfall":"-"},
{ "Day": 27, "rainfall": "-"},
{ "Day": 28, "rainfall": "03"},
{ "Day": 29, "rainfall": "-"},
{ "Day": 30, "rainfall": "-"},
{ "Day": 31, "rainfall": "-"}
]
},
{
"Month":"September",
"rainfall": [
{ "Day": 1, "rainfall": "-"},
{ "Day": 2, "rainfall":"-"},
{ "Day": 3, "rainfall":"-"},
{ "Day": 4, "rainfall": "-"},
{ "Day": 5, "rainfall": "-"},
{ "Day": 6, "rainfall":"41"},
{ "Day": 7, "rainfall":"11"},
{ "Day": 8, "rainfall": "-"},
{ "Day": 9, "rainfall": "-"},
{ "Day": 10, "rainfall":"-"},
{ "Day": 11, "rainfall":"41"},
{ "Day": 12, "rainfall":"1.33"},
{ "Day": 13, "rainfall":"-"},
{ "Day": 14, "rainfall":"-"},
{ "Day": 15, "rainfall":"-"},
{ "Day": 16, "rainfall":"-"},
{ "Day": 17, "rainfall":"-"},
{ "Day": 18, "rainfall":"-"},
{ "Day": 19, "rainfall":"-"},
{ "Day": 20, "rainfall":"-"},
{ "Day": 21, "rainfall":"-"},
{ "Day": 22, "rainfall": "28"},
{ "Day": 23, "rainfall": "1.41"},
{ "Day": 24, "rainfall": "-"},
{ "Day": 25, "rainfall":"-"},
{ "Day": 26, "rainfall":"-"},
{ "Day": 27, "rainfall":"-"},
{ "Day": 28, "rainfall":"-"},
{ "Day": 29, "rainfall": "-"},
{ "Day": 30, "rainfall":"-"},
{ "Day": 31, "rainfall": "-"}
]
},
{
"Month":"October",
"rainfall": [
{ "Day": 1, "rainfall": "-"},
{ "Day": 2, "rainfall":"-"},
{ "Day": 3, "rainfall":"-"},
{ "Day": 4, "rainfall": "-"},
{ "Day": 5, "rainfall": "-"},
{ "Day": 6, "rainfall":"-"},
{ "Day": 7, "rainfall":"-"},
{ "Day": 8, "rainfall":"-"},
{ "Day": 9, "rainfall": "36"},
{ "Day": 10, "rainfall":"-"},
{ "Day": 11, "rainfall": "-"},
{ "Day": 12, "rainfall": "-"},
{ "Day": 13, "rainfall":"-"},
{ "Day": 14, "rainfall": "-"},
{ "Day": 15, "rainfall":"-"},
{ "Day": 16, "rainfall":"-"},
{ "Day": 17, "rainfall":"-"},
{ "Day": 18, "rainfall":"-"},
{ "Day": 19, "rainfall":"-"},
{ "Day": 20, "rainfall":"-"},
{ "Day": 21, "rainfall": "-"},
{ "Day": 22, "rainfall":"-"},
{ "Day": 23, "rainfall":"-"},
{ "Day": 24, "rainfall":"-"},
{ "Day": 25, "rainfall":"-"},
{ "Day": 26, "rainfall":"-"},
{ "Day": 27, "rainfall":"-"},
{ "Day": 28, "rainfall":"-"},
{ "Day": 29, "rainfall":"-"},
{ "Day": 30, "rainfall": "-"},
{ "Day": 31, "rainfall": "-"}
]
},
{
"Month":"November",
"rainfall": [
{ "Day": 1, "rainfall":"07"},
{ "Day": 2, "rainfall":"63"},
{ "Day": 3, "rainfall": "x"},
{ "Day": 4, "rainfall":"65"},
{ "Day": 5, "rainfall":"-"},
{ "Day": 6, "rainfall":"-"},
{ "Day": 7, "rainfall":"02"},
{ "Day": 8, "rainfall":"75"},
{ "Day": 9, "rainfall":"87"},
{ "Day": 10, "rainfall":"26"},
{ "Day": 11, "rainfall":"x"},
{ "Day": 12, "rainfall":"09"},
{ "Day": 13, "rainfall":"-"},
{ "Day": 14, "rainfall":"72"},
{ "Day": 15, "rainfall": "03"},
{ "Day": 16, "rainfall":"-"},
{ "Day": 17, "rainfall": "43"},
{ "Day": 18, "rainfall":"64"},
{ "Day": 19, "rainfall":"18"},
{ "Day": 20, "rainfall":"28"},
{ "Day": 21, "rainfall":"35"},
{ "Day": 22, "rainfall": "1.29"},
{ "Day": 23, "rainfall": "04"},
{ "Day": 24, "rainfall":"12"},
{ "Day": 25, "rainfall": "13"},
{ "Day": 26, "rainfall":"-"},
{ "Day": 27, "rainfall":"-"},
{ "Day": 28, "rainfall":"-"},
{ "Day": 29, "rainfall": "05"},
{ "Day": 30, "rainfall": "04"},
{ "Day": 31, "rainfall": "-"}
]
},
{
"Month":"December",
"rainfall": [
{ "Day": 1, "rainfall":"-"},
{ "Day": 2, "rainfall":"08"},
{ "Day": 3, "rainfall":"20"},
{ "Day": 4, "rainfall":"02"},
{ "Day": 5, "rainfall":"32"},
{ "Day": 6, "rainfall":"13"},
{ "Day": 7, "rainfall":"61"},
{ "Day": 8, "rainfall":"09"},
{ "Day": 9, "rainfall":"-"},
{ "Day": 10, "rainfall":"-"},
{ "Day": 11, "rainfall":"-"},
{ "Day": 12, "rainfall":"-"},
{ "Day": 13, "rainfall":"-"},
{ "Day": 14, "rainfall":"06"},
{ "Day": 15, "rainfall":"-"},
{ "Day": 16, "rainfall":"-"},
{ "Day": 17, "rainfall":"-"},
{ "Day": 18, "rainfall":"-"},
{ "Day": 19, "rainfall":"-"},
{ "Day": 20, "rainfall":"-"},
{ "Day": 21, "rainfall":"65"},
{ "Day": 22, "rainfall":"48"},
{ "Day": 23, "rainfall":"05"},
{ "Day": 24, "rainfall":"02"},
{ "Day": 25, "rainfall":"28"},
{ "Day": 26, "rainfall":"1.20"},
{ "Day": 27, "rainfall":"1.19"},
{ "Day": 28, "rainfall":"08"},
{ "Day": 29, "rainfall":"11"},
{ "Day": 30, "rainfall":"04"},
{ "Day": 31, "rainfall":"94"}
]
}
]
}
{
"Totals": [
"7.48",
"3.44",
"15.48",
"5.89",
"3.22",
"5.62",
"5.24",
"2.13",
"3.73",
"2.17",
"7.05",
"7.42"
]
}
And does it work? Well, almost. Metadata is correct, monthly totals are correct, it’s getting the right numbers for daily obs (with a few missing decimal points - but that’s easy to fix). But the daily obs are not all associated with the correct dates - it’s a bit confused by missing data.

The full data extraction costs a bit less than 10,000 tokens - about 1/4 of a U.S. cent.