Full page extraction - 10-year monthlies
The test on the daily-rainfall page suggested that Gemini worked almost perfectly, but struggled with the missing entries in a table. To check this, I’m going to try a test with a page that has no missing data - the 10-year monthly totals for a station. (Image)
The code is almost the same - I’ve just changed the metadata names to match the image, and tweaked the data structure for monthly totals rather than daily.
#!/usr/bin/env python3
# Get all required data from the page
# This version does a sample from the 10-year monthlies
import os
import PIL.Image
import google.generativeai as genai
import typing_extensions as typing
# You will need an API key - get it from https://ai.google.dev/gemini-api/docs/api-key
# I keep my API key in the .gemini_api file in my home directory.
# expanduser() is used rather than os.getenv("HOME") because getenv returns
# None when HOME is unset, which would silently build "None/.gemini_api".
with open(os.path.expanduser("~/.gemini_api"), "r", encoding="utf-8") as file:
    # strip() drops the trailing newline most editors leave in the file
    api_key = file.read().strip()
# Default protocol is 'GRPC' - but that is blocked by the Office firewall.
# Use 'REST' instead.
genai.configure(api_key=api_key, transport="rest")
# Schema for the station metadata requested from the page.
# Passed as response_schema to the first generate_content call.
class MetaData(typing.TypedDict):
    # NOTE(review): the sample reply for this 10-year page contains no
    # "Year" key - presumably left over from the single-year daily page;
    # confirm whether it is still wanted here.
    Year: int
    StationNumber: int  # e.g. 4007 in the sample reply
    Location: str  # e.g. "Tredegar Laboratory"
    County: str  # e.g. "Monmouth"
    River_basin: str  # e.g. "Ebbw-2"
    Type_of_gauge: str  # e.g. "Howard"
    Observer: str  # may span several lines, joined with "\n" in the reply
# Specify a structure for the monthly observations.
# Monthly is the innermost record: one month's value within one year.
class Monthly(typing.TypedDict):
    Month: str  # month name, e.g. "January"
    rainfall: str  # that month's total, returned as text (e.g. "4.72")
# One year of the table: the year plus its Monthly entries
# (twelve per year in the sample reply).
class Annual(typing.TypedDict):
    Year: int
    rainfall: list[Monthly]
# Top-level reply for the monthly-observations request: one Annual entry
# per year on the page (1941-1950 in the sample reply).
class Decadal(typing.TypedDict):
    rainfall: list[Annual]
# Reply structure for the annual-totals request: the totals as text.
# NOTE(review): the sample reply holds eleven strings for ten years -
# presumably the yearly totals plus one further figure; confirm against
# the page image.
class Totals(typing.TypedDict):
    Totals: list[str]
# Open the sample page image that every request below is sent with
sample_page = "../../images/monthlies/TYRain_1941-1950_25_pt1-10.jpg"
img = PIL.Image.open(sample_page)
# Choose the AI model to query - this one is the latest as of 2025-01-29
model_name = "gemini-2.0-flash-exp"
model = genai.GenerativeModel(model_name)
def _extract(prompt, schema, stem):
    """Ask the model one question about the page image and save the reply.

    The request is the page image, a blank-line separator and *prompt*,
    with the reply requested as JSON constrained to *schema*. Uses the
    module-level ``model`` and ``img``.

    Args:
        prompt: Instruction text sent after the image.
        schema: TypedDict class passed as ``response_schema``.
        stem: Output file name without extension; the reply text goes to
            ``<stem>.json`` and ``str()`` of the whole response object to
            ``<stem>.txt``.

    Returns:
        The response object returned by ``model.generate_content``.
    """
    response = model.generate_content(
        [img, "\n\n", prompt],
        generation_config=genai.GenerationConfig(
            response_mime_type="application/json", response_schema=schema
        ),
    )
    # Structured data as JSON. The encoding is given explicitly so the
    # output does not depend on the platform's default encoding.
    with open(f"{stem}.json", "w", encoding="utf-8") as file:
        file.write(response.text)
    # String form of the full response object, kept next to the JSON
    with open(f"{stem}.txt", "w", encoding="utf-8") as file:
        file.write(str(response))
    return response


# Get metadata from the image
result = _extract("List the station metadata", MetaData, "metadata")
# Get the monthly observations from the image
# (the trailing space in the prompt is kept exactly as originally sent)
result = _extract("List the monthly observations. ", Decadal, "monthly")
# Get the annual totals
result = _extract("List the annual totals.", Totals, "totals")
Producing three JSON output files:
{
"County": "Monmouth",
"Location": "Tredegar Laboratory",
"Observer": "Tredegar Iron & Coal Co. Ltd.\nNational Coal Board.",
"River_basin": "Ebbw-2",
"StationNumber": 4007,
"Type_of_gauge": "Howard"
}
{
"rainfall": [
{
"Year": 1941,
"rainfall": [
{
"Month": "January",
"rainfall": "4.72"
},
{
"Month": "February",
"rainfall": "6.12"
},
{
"Month": "March",
"rainfall": "4.16"
},
{
"Month": "April",
"rainfall": "1.19"
},
{
"Month": "May",
"rainfall": "3.72"
},
{
"Month": "June",
"rainfall": "2.06"
},
{
"Month": "July",
"rainfall": "2.89"
},
{
"Month": "August",
"rainfall": "7.00"
},
{
"Month": "September",
"rainfall": "0.82"
},
{
"Month": "October",
"rainfall": "6.93"
},
{
"Month": "November",
"rainfall": "4.18"
},
{
"Month":"December",
"rainfall":"4.16"
}
]
},
{
"Year": 1942,
"rainfall": [
{
"Month": "January",
"rainfall": "5.42"
},
{
"Month": "February",
"rainfall": "0.55"
},
{
"Month": "March",
"rainfall": "4.33"
},
{
"Month": "April",
"rainfall": "3.59"
},
{
"Month": "May",
"rainfall": "8.98"
},
{
"Month":"June",
"rainfall":"0.31"
},
{
"Month": "July",
"rainfall":"3.42"
},
{
"Month":"August",
"rainfall":"6.54"
},
{
"Month": "September",
"rainfall": "3.95"
},
{
"Month":"October",
"rainfall":"4.52"
},
{
"Month":"November",
"rainfall":"0.78"
},
{
"Month":"December",
"rainfall":"9.00"
}
]
},
{
"Year": 1943,
"rainfall": [
{
"Month": "January",
"rainfall": "11.82"
},
{
"Month": "February",
"rainfall": "4.44"
},
{
"Month": "March",
"rainfall": "1.22"
},
{
"Month": "April",
"rainfall": "1.68"
},
{
"Month": "May",
"rainfall": "5.92"
},
{
"Month":"June",
"rainfall":"4.44"
},
{
"Month": "July",
"rainfall":"3.43"
},
{
"Month":"August",
"rainfall":"5.30"
},
{
"Month": "September",
"rainfall":"5.30"
},
{
"Month":"October",
"rainfall":"5.21"
},
{
"Month":"November",
"rainfall":"4.21"
},
{
"Month":"December",
"rainfall":"3.30"
}
]
},
{
"Year": 1944,
"rainfall": [
{
"Month": "January",
"rainfall": "6.68"
},
{
"Month": "February",
"rainfall": "1.19"
},
{
"Month": "March",
"rainfall": "0.38"
},
{
"Month": "April",
"rainfall": "2.96"
},
{
"Month": "May",
"rainfall": "2.11"
},
{
"Month":"June",
"rainfall":"3.41"
},
{
"Month":"July",
"rainfall":"3.56"
},
{
"Month":"August",
"rainfall":"4.40"
},
{
"Month":"September",
"rainfall":"5.55"
},
{
"Month":"October",
"rainfall":"8.79"
},
{
"Month":"November",
"rainfall":"9.75"
},
{
"Month":"December",
"rainfall":"5.31"
}
]
},
{
"Year": 1945,
"rainfall": [
{
"Month": "January",
"rainfall": "4.71"
},
{
"Month": "February",
"rainfall": "7.64"
},
{
"Month": "March",
"rainfall": "2.63"
},
{
"Month": "April",
"rainfall": "2.81"
},
{
"Month": "May",
"rainfall":"3.76"
},
{
"Month":"June",
"rainfall":"5.86"
},
{
"Month":"July",
"rainfall":"3.95"
},
{
"Month":"August",
"rainfall":"3.83"
},
{
"Month":"September",
"rainfall":"4.58"
},
{
"Month":"October",
"rainfall":"7.54"
},
{
"Month":"November",
"rainfall":"0.80"
},
{
"Month":"December",
"rainfall":"10.19"
}
]
},
{
"Year": 1946,
"rainfall": [
{
"Month": "January",
"rainfall": "8.55"
},
{
"Month": "February",
"rainfall": "6.00"
},
{
"Month": "March",
"rainfall":"2.48"
},
{
"Month": "April",
"rainfall": "1.96"
},
{
"Month": "May",
"rainfall":"5.14"
},
{
"Month":"June",
"rainfall":"5.68"
},
{
"Month":"July",
"rainfall":"2.66"
},
{
"Month":"August",
"rainfall":"9.30"
},
{
"Month":"September",
"rainfall":"9.23"
},
{
"Month":"October",
"rainfall":"1.63"
},
{
"Month":"November",
"rainfall":"14.24"
},
{
"Month":"December",
"rainfall":"7.17"
}
]
},
{
"Year": 1947,
"rainfall": [
{
"Month": "January",
"rainfall": "5.91"
},
{
"Month": "February",
"rainfall": "2.16"
},
{
"Month":"March",
"rainfall":"12.37"
},
{
"Month": "April",
"rainfall": "6.67"
},
{
"Month":"May",
"rainfall":"3.54"
},
{
"Month":"June",
"rainfall":"3.30"
},
{
"Month":"July",
"rainfall":"3.77"
},
{
"Month":"August",
"rainfall":"0.55"
},
{
"Month":"September",
"rainfall":"2.87"
},
{
"Month":"October",
"rainfall":"1.71"
},
{
"Month":"November",
"rainfall":"6.37"
},
{
"Month":"December",
"rainfall":"5.64"
}
]
},
{
"Year": 1948,
"rainfall": [
{
"Month": "January",
"rainfall": "14.19"
},
{
"Month":"February",
"rainfall":"3.56"
},
{
"Month":"March",
"rainfall":"3.58"
},
{
"Month":"April",
"rainfall":"3.75"
},
{
"Month":"May",
"rainfall":"4.79"
},
{
"Month":"June",
"rainfall":"6.05"
},
{
"Month":"July",
"rainfall":"1.83"
},
{
"Month":"August",
"rainfall":"0.662"
},
{
"Month":"September",
"rainfall":"5.27"
},
{
"Month":"October",
"rainfall":"6.08"
},
{
"Month":"November",
"rainfall":"3.10"
},
{
"Month":"December",
"rainfall":"11.89"
}
]
},
{
"Year": 1949,
"rainfall": [
{
"Month": "January",
"rainfall": "2.02"
},
{
"Month": "February",
"rainfall": "3.33"
},
{
"Month":"March",
"rainfall":"2.63"
},
{
"Month":"April",
"rainfall":"5.45"
},
{
"Month":"May",
"rainfall":"4.45"
},
{
"Month":"June",
"rainfall":"1.09"
},
{
"Month":"July",
"rainfall":"1.05"
},
{
"Month":"August",
"rainfall":"3.57"
},
{
"Month":"September",
"rainfall":"2.95"
},
{
"Month":"October",
"rainfall":"12.28"
},
{
"Month":"November",
"rainfall":"7.71"
},
{
"Month":"December",
"rainfall":"6.44"
}
]
},
{
"Year": 1950,
"rainfall": [
{
"Month": "January",
"rainfall": "1.81"
},
{
"Month":"February",
"rainfall":"10.92"
},
{
"Month":"March",
"rainfall":"4.19"
},
{
"Month":"April",
"rainfall":"4.67"
},
{
"Month":"May",
"rainfall":"2.91"
},
{
"Month":"June",
"rainfall":"2.34"
},
{
"Month":"July",
"rainfall":"7.58"
},
{
"Month":"August",
"rainfall":"8.69"
},
{
"Month":"September",
"rainfall":"7.02"
},
{
"Month":"October",
"rainfall":"3.18"
},
{
"Month":"November",
"rainfall":"7.62"
},
{
"Month":"December",
"rainfall":"3.63"
}
]
}
]
}
{
"Totals": ["47.95","51.39","56.57","54.09","58.30","74.04","54.86","70.71","52.97","64.56","58544"]
}
And this works pretty much perfectly.
![Image showing extracted data](s3://crabby-images/41d75/41d7571671ac2770dc884cff5ac88f69c0c5e85a)