Full page extraction

Enough tests - let’s try and extract all the data from a page - the station metadata, the daily rainfall observations, and the monthly totals.

The code is a straightforward extension of the earlier tests - it adds the required structure for each data section. One complication is that extracting everything at once runs into Gemini’s maximum output size limit, so I’m doing it in four steps - metadata, daily obs Jan-Jun, daily obs Jul-Dec, and monthly totals.

#!/usr/bin/env python3

# Get all required data from the page

import os
import PIL.Image
import google.generativeai as genai
import typing_extensions as typing

# You will need an API key - get one from https://ai.google.dev/gemini-api/docs/api-key

# I keep my API key in the .gemini_api file in my home directory.
# expanduser("~") finds the home directory even when $HOME is not set
# (e.g. on Windows); os.getenv("HOME") would return None there and the
# script would look for the literal path "None/.gemini_api".
with open(os.path.join(os.path.expanduser("~"), ".gemini_api"), "r") as file:
    api_key = file.read().strip()  # drop the trailing newline

# Default protocol is 'GRPC' - but that is blocked by the Office firewall.
#  Use 'REST' instead.
genai.configure(api_key=api_key, transport="rest")


# Specify a structure for the station metadata.
# Passed as response_schema for the "List the station metadata" request, so
# the JSON reply is requested with these keys and value types.
class MetaData(typing.TypedDict):
    Year: int
    StationNumber: int
    Location: str
    County: str
    Sea_level_height: int  # presumably height above sea level; units not stated here
    Gauge_diameter: int  # units not stated here - TODO confirm
    # Gauge height is split into two whole-number fields (feet and inches).
    Gauge_height_feet: int
    Gauge_height_inches: int


# Specify a structure for the daily observations.
# One entry per day; used as the element type of Monthly.rainfall.
class Daily(typing.TypedDict):
    Day: int  # day of the month
    # Text rather than a number: the daily-observation prompts ask for the
    # character '-' wherever an observation is missing.
    rainfall: str


# One month of daily observations: the month's name plus its list of
# per-day entries.
class Monthly(typing.TypedDict):
    Month: str  # month name as returned by the model
    rainfall: list[Daily]  # the daily entries for this month


# Top-level structure for a daily-observations reply.
# Passed as response_schema for both half-year requests (January to June and
# July to December), so each reply holds only the months asked for.
class Annual(typing.TypedDict):
    Month: list[Monthly]  # note: the key is singular but holds a list of months


# Structure for the monthly totals.
# Passed as response_schema for the "List the monthly totals for each month."
# request.
class Totals(typing.TypedDict):
    Totals: list[str]  # one total per month, kept as text


# Load the sample image
img = PIL.Image.open(
    "../../images/jpgs_300dpi/Devon_1941-1950_RainNos_1651-1689-293.jpg"
)

# Pick an AI to use - this one is the latest as of 2025-01-29
model = genai.GenerativeModel("gemini-2.0-flash-exp")

# Instructions shared by both daily-observation prompts.
# NOTE: all prompt text below is kept exactly as originally run, so that the
# requests sent to the model are unchanged.
MISSING_DATA_HINT = (
    "Be careful of missing data. Several days have missing data and "
    "These days will have an entry that is blank or has a dash '-'. "
    "Return the character '-' for missing data."
)


def _extract_section(model, image, prompt, schema, stem):
    """Ask the model one question about the image and save the reply.

    Args:
        model: The generative model to query.
        image: The page image sent along with the prompt.
        prompt: The instruction text for this data section.
        schema: TypedDict class passed as the JSON response schema.
        stem: Output file name without extension.

    Returns:
        The response object returned by ``model.generate_content``.

    Writes ``<stem>.json`` (the structured data as JSON) and ``<stem>.txt``
    (the string form of the whole response) to the current directory.
    """
    result = model.generate_content(
        [image, "\n\n", prompt],
        generation_config=genai.GenerationConfig(
            response_mime_type="application/json", response_schema=schema
        ),
    )
    # Write UTF-8 explicitly so the files do not depend on the locale's
    # default encoding.
    # Structured data as JSON
    with open(f"{stem}.json", "w", encoding="utf-8") as file:
        file.write(result.text)
    # Full response, for debugging
    with open(f"{stem}.txt", "w", encoding="utf-8") as file:
        file.write(str(result))
    return result


# (output file stem, response schema, prompt) for each data section.
# The daily observations are requested in two batches because of output
# size limits.
SECTIONS = [
    ("metadata", MetaData, "List the station metadata"),
    (
        "daily1",
        Annual,
        "List the daily observations  for months January to June. "
        + MISSING_DATA_HINT,
    ),
    (
        "daily2",
        Annual,
        "List the daily observations for months July to December. "
        + MISSING_DATA_HINT,
    ),
    ("totals", Totals, "List the monthly totals for each month."),
]

# Run the requests one after another, in the order listed above.
for stem, schema, prompt in SECTIONS:
    result = _extract_section(model, img, prompt, schema, stem)

This produces four JSON output files (metadata.json, daily1.json, daily2.json, and totals.json):

{
  "County": "DEVON",
  "Gauge_diameter": 5,
  "Gauge_height_feet": 1,
  "Gauge_height_inches": 3,
  "Location": "BADWORTHY COTTAGE, S. BRENT",
  "Sea_level_height": 550,
  "StationNumber": 1678,
  "Year": 1947
}
{
"Month":[
    {
        "Month":"Jan",
        "rainfall":[
            {"Day":1,"rainfall":"0.54"},
            {"Day":2,"rainfall":"0.31"},
            {"Day":3,"rainfall":"0.40"},
            {"Day":4,"rainfall":"-"},
            {"Day":5,"rainfall":"1.03"},
            {"Day":6,"rainfall":"0.43"},
            {"Day":7,"rainfall":"0.04"},
             {"Day":8,"rainfall":"0.62"},
            {"Day":9,"rainfall":"0.06"},
            {"Day":10,"rainfall":"1.24"},
            {"Day":11,"rainfall":"0.49"},
            {"Day":12,"rainfall":"0.33"},
            {"Day":13,"rainfall":"1.05"},
            {"Day":14,"rainfall":"0.11"},
            {"Day":15,"rainfall":"-"},
            {"Day":16,"rainfall":"0.15"},
             {"Day":17,"rainfall":"0.05"},
            {"Day":18,"rainfall":"-"},
            {"Day":19,"rainfall":"-"},
            {"Day":20,"rainfall":"-"},
            {"Day":21,"rainfall":"-"},
            {"Day":22,"rainfall":"-"},
             {"Day":23,"rainfall":"-"},
            {"Day":24,"rainfall":"-"},
             {"Day":25,"rainfall":"-"},
            {"Day":26,"rainfall":"-"},
             {"Day":27,"rainfall":"-"},
            {"Day":28,"rainfall":"0.02"},
            {"Day":29,"rainfall":"-"},
            {"Day":30,"rainfall":"0.63"},
            {"Day":31,"rainfall":"-"}
        ]
    },
    {
        "Month":"Feb",
        "rainfall":[
            {"Day":1,"rainfall":"1.19"},
            {"Day":2,"rainfall":"0.42"},
            {"Day":3,"rainfall":"0.67"},
            {"Day":4,"rainfall":"0.11"},
             {"Day":5,"rainfall":"-"},
            {"Day":6,"rainfall":"-"},
            {"Day":7,"rainfall":"0.29"},
            {"Day":8,"rainfall":"-"},
             {"Day":9,"rainfall":"0.17"},
            {"Day":10,"rainfall":"-"},
            {"Day":11,"rainfall":"0.01"},
            {"Day":12,"rainfall":"-"},
            {"Day":13,"rainfall":"-"},
            {"Day":14,"rainfall":"-"},
             {"Day":15,"rainfall":"1.74"},
            {"Day":16,"rainfall":"0.12"},
            {"Day":17,"rainfall":"0.90"},
            {"Day":18,"rainfall":"0.40"},
            {"Day":19,"rainfall":"0.36"},
            {"Day":20,"rainfall":"0.65"},
             {"Day":21,"rainfall":"0.17"},
             {"Day":22,"rainfall":"0.55"},
            {"Day":23,"rainfall":"0.39"},
            {"Day":24,"rainfall":"0.05"},
             {"Day":25,"rainfall":"1.00"},
            {"Day":26,"rainfall":"-"},
             {"Day":27,"rainfall":"-"},
            {"Day":28,"rainfall":"0.22"},
            {"Day":29,"rainfall":"-"},
            {"Day":30,"rainfall":"0.32"},
            {"Day":31,"rainfall":"-"}
        ]
    },
    {
        "Month":"Mar",
        "rainfall":[
            {"Day":1,"rainfall":"-"},
            {"Day":2,"rainfall":"-"},
            {"Day":3,"rainfall":"-"},
            {"Day":4,"rainfall":"0.03"},
             {"Day":5,"rainfall":"0.79"},
            {"Day":6,"rainfall":"-"},
            {"Day":7,"rainfall":"0.15"},
            {"Day":8,"rainfall":"-"},
             {"Day":9,"rainfall":"1.23"},
            {"Day":10,"rainfall":"1.08"},
            {"Day":11,"rainfall":"0.28"},
            {"Day":12,"rainfall":"2.28"},
            {"Day":13,"rainfall":"0.35"},
            {"Day":14,"rainfall":"0.17"},
            {"Day":15,"rainfall":"-"},
            {"Day":16,"rainfall":"-"},
            {"Day":17,"rainfall":"-"},
            {"Day":18,"rainfall":"-"},
            {"Day":19,"rainfall":"0.12"},
             {"Day":20,"rainfall":"1.01"},
             {"Day":21,"rainfall":"0.16"},
            {"Day":22,"rainfall":"0.18"},
             {"Day":23,"rainfall":"0.91"},
            {"Day":24,"rainfall":"0.01"},
             {"Day":25,"rainfall":"-"},
             {"Day":26,"rainfall":"-"},
            {"Day":27,"rainfall":"0.45"},
             {"Day":28,"rainfall":"-"},
            {"Day":29,"rainfall":"0.11"},
            {"Day":30,"rainfall":"0.47"},
            {"Day":31,"rainfall":"0.04"}
        ]
    },
    {
        "Month":"Apr",
        "rainfall":[
            {"Day":1,"rainfall":"0.05"},
            {"Day":2,"rainfall":"0.01"},
            {"Day":3,"rainfall":"0.56"},
            {"Day":4,"rainfall":"0.10"},
            {"Day":5,"rainfall":"1.60"},
            {"Day":6,"rainfall":"0.34"},
             {"Day":7,"rainfall":"0.32"},
            {"Day":8,"rainfall":"0.15"},
            {"Day":9,"rainfall":"-"},
             {"Day":10,"rainfall":"-"},
             {"Day":11,"rainfall":"-"},
            {"Day":12,"rainfall":"-"},
            {"Day":13,"rainfall":"-"},
            {"Day":14,"rainfall":"0.05"},
            {"Day":15,"rainfall":"0.06"},
            {"Day":16,"rainfall":"-"},
            {"Day":17,"rainfall":"0.74"},
            {"Day":18,"rainfall":"-"},
            {"Day":19,"rainfall":"-"},
            {"Day":20,"rainfall":"-"},
             {"Day":21,"rainfall":"-"},
            {"Day":22,"rainfall":"-"},
             {"Day":23,"rainfall":"-"},
            {"Day":24,"rainfall":"0.52"},
             {"Day":25,"rainfall":"0.05"},
            {"Day":26,"rainfall":"-"},
             {"Day":27,"rainfall":"-"},
            {"Day":28,"rainfall":"-"},
            {"Day":29,"rainfall":"-"},
             {"Day":30,"rainfall":"0.37"},
            {"Day":31,"rainfall":"-"}
        ]
    },
    {
        "Month":"May",
        "rainfall":[
            {"Day":1,"rainfall":"0.14"},
            {"Day":2,"rainfall":"0.45"},
            {"Day":3,"rainfall":"0.02"},
            {"Day":4,"rainfall":"0.11"},
            {"Day":5,"rainfall":"0.08"},
            {"Day":6,"rainfall":"0.12"},
             {"Day":7,"rainfall":"0.01"},
             {"Day":8,"rainfall":"-"},
            {"Day":9,"rainfall":"0.14"},
            {"Day":10,"rainfall":"-"},
            {"Day":11,"rainfall":"-"},
            {"Day":12,"rainfall":"0.23"},
            {"Day":13,"rainfall":"-"},
             {"Day":14,"rainfall":"0.52"},
            {"Day":15,"rainfall":"-"},
            {"Day":16,"rainfall":"-"},
            {"Day":17,"rainfall":"0.05"},
            {"Day":18,"rainfall":"0.04"},
            {"Day":19,"rainfall":"-"},
             {"Day":20,"rainfall":"-"},
            {"Day":21,"rainfall":"-"},
            {"Day":22,"rainfall":"-"},
            {"Day":23,"rainfall":"0.02"},
            {"Day":24,"rainfall":"-"},
            {"Day":25,"rainfall":"-"},
             {"Day":26,"rainfall":"-"},
            {"Day":27,"rainfall":"-"},
             {"Day":28,"rainfall":"-"},
            {"Day":29,"rainfall":"-"},
            {"Day":30,"rainfall":"-"},
            {"Day":31,"rainfall":"-"}
        ]
    },
    {
        "Month":"June",
        "rainfall":[
            {"Day":1,"rainfall":"-"},
            {"Day":2,"rainfall":"-"},
            {"Day":3,"rainfall":"0.42"},
            {"Day":4,"rainfall":"0.39"},
             {"Day":5,"rainfall":"1.10"},
            {"Day":6,"rainfall":"0.22"},
             {"Day":7,"rainfall":"0.02"},
            {"Day":8,"rainfall":"0.58"},
            {"Day":9,"rainfall":"0.52"},
            {"Day":10,"rainfall":"0.37"},
             {"Day":11,"rainfall":"-"},
            {"Day":12,"rainfall":"0.11"},
            {"Day":13,"rainfall":"-"},
            {"Day":14,"rainfall":"-"},
             {"Day":15,"rainfall":"-"},
            {"Day":16,"rainfall":"-"},
            {"Day":17,"rainfall":"-"},
            {"Day":18,"rainfall":"-"},
             {"Day":19,"rainfall":"-"},
            {"Day":20,"rainfall":"1.50"},
            {"Day":21,"rainfall":"-"},
            {"Day":22,"rainfall":"-"},
             {"Day":23,"rainfall":"-"},
            {"Day":24,"rainfall":"-"},
             {"Day":25,"rainfall":"0.03"},
            {"Day":26,"rainfall":"-"},
            {"Day":27,"rainfall":"0.51"},
             {"Day":28,"rainfall":"0.56"},
            {"Day":29,"rainfall":"-"},
            {"Day":30,"rainfall":"0.08"},
            {"Day":31,"rainfall":"-"}
        ]
    }
  ]
}
{
  "Month": [
    {
      "Month": "July",
      "rainfall": [
        { "Day": 1, "rainfall": "-"},
        { "Day": 2, "rainfall": "-"},
        { "Day": 3, "rainfall": "-"},
        { "Day": 4, "rainfall": "-"},
        { "Day": 5, "rainfall": "-"},
        { "Day": 6, "rainfall": "-"},
        { "Day": 7, "rainfall": "-"},
        { "Day": 8, "rainfall": "-"},
        { "Day": 9, "rainfall": "-"},
        { "Day": 10, "rainfall":"-"},
        { "Day": 11, "rainfall": "-"},
        { "Day": 12, "rainfall": "-"},
        { "Day": 13, "rainfall": "-"},
         { "Day": 14, "rainfall": "-"},
        { "Day": 15, "rainfall": "-"},
        { "Day": 16, "rainfall":"-"},
        { "Day": 17, "rainfall": "-"},
        { "Day": 18, "rainfall": "-"},
        { "Day": 19, "rainfall": "-"},
        { "Day": 20, "rainfall": "-"},
        { "Day": 21, "rainfall":"-"},
        { "Day": 22, "rainfall": "-"},
        { "Day": 23, "rainfall": "-"},
        { "Day": 24, "rainfall": "-"},
         { "Day": 25, "rainfall": "-"},
        { "Day": 26, "rainfall": "-"},
         { "Day": 27, "rainfall": "-"},
         { "Day": 28, "rainfall": "-"},
        { "Day": 29, "rainfall": "-"},
        { "Day": 30, "rainfall": "-"},
        { "Day": 31, "rainfall": "-"}
      ]
    },
    {
      "Month": "August",
      "rainfall": [
        { "Day": 1, "rainfall": "08"},
        { "Day": 2, "rainfall": "06"},
        { "Day": 3, "rainfall":"1.59"},
        { "Day": 4, "rainfall": "39"},
        { "Day": 5, "rainfall": "-"},
        { "Day": 6, "rainfall": "22"},
        { "Day": 7, "rainfall": "64"},
        { "Day": 8, "rainfall":"58"},
        { "Day": 9, "rainfall": "52"},
        { "Day": 10, "rainfall":"37"},
        { "Day": 11, "rainfall": "-"},
        { "Day": 12, "rainfall":"11"},
        { "Day": 13, "rainfall":"03"},
        { "Day": 14, "rainfall": "-"},
        { "Day": 15, "rainfall": "-"},
        { "Day": 16, "rainfall": "-"},
        { "Day": 17, "rainfall": "-"},
        { "Day": 18, "rainfall": "-"},
        { "Day": 19, "rainfall": "-"},
        { "Day": 20, "rainfall": "1.50"},
        { "Day": 21, "rainfall":"75"},
        { "Day": 22, "rainfall": "-"},
        { "Day": 23, "rainfall":"-"},
        { "Day": 24, "rainfall": "-"},
        { "Day": 25, "rainfall": "03"},
        { "Day": 26, "rainfall":"-"},
        { "Day": 27, "rainfall": "-"},
         { "Day": 28, "rainfall": "03"},
        { "Day": 29, "rainfall": "-"},
         { "Day": 30, "rainfall": "-"},
        { "Day": 31, "rainfall": "-"}
      ]
    },
    {
      "Month":"September",
       "rainfall": [
        { "Day": 1, "rainfall": "-"},
        { "Day": 2, "rainfall":"-"},
        { "Day": 3, "rainfall":"-"},
        { "Day": 4, "rainfall": "-"},
        { "Day": 5, "rainfall": "-"},
        { "Day": 6, "rainfall":"41"},
        { "Day": 7, "rainfall":"11"},
        { "Day": 8, "rainfall": "-"},
        { "Day": 9, "rainfall": "-"},
        { "Day": 10, "rainfall":"-"},
         { "Day": 11, "rainfall":"41"},
        { "Day": 12, "rainfall":"1.33"},
        { "Day": 13, "rainfall":"-"},
        { "Day": 14, "rainfall":"-"},
         { "Day": 15, "rainfall":"-"},
        { "Day": 16, "rainfall":"-"},
        { "Day": 17, "rainfall":"-"},
        { "Day": 18, "rainfall":"-"},
        { "Day": 19, "rainfall":"-"},
        { "Day": 20, "rainfall":"-"},
         { "Day": 21, "rainfall":"-"},
         { "Day": 22, "rainfall": "28"},
        { "Day": 23, "rainfall": "1.41"},
         { "Day": 24, "rainfall": "-"},
         { "Day": 25, "rainfall":"-"},
         { "Day": 26, "rainfall":"-"},
          { "Day": 27, "rainfall":"-"},
         { "Day": 28, "rainfall":"-"},
         { "Day": 29, "rainfall": "-"},
          { "Day": 30, "rainfall":"-"},
        { "Day": 31, "rainfall": "-"}
      ]
    },
    {
      "Month":"October",
       "rainfall": [
        { "Day": 1, "rainfall": "-"},
        { "Day": 2, "rainfall":"-"},
        { "Day": 3, "rainfall":"-"},
        { "Day": 4, "rainfall": "-"},
        { "Day": 5, "rainfall": "-"},
        { "Day": 6, "rainfall":"-"},
        { "Day": 7, "rainfall":"-"},
        { "Day": 8, "rainfall":"-"},
        { "Day": 9, "rainfall": "36"},
         { "Day": 10, "rainfall":"-"},
         { "Day": 11, "rainfall": "-"},
         { "Day": 12, "rainfall": "-"},
        { "Day": 13, "rainfall":"-"},
        { "Day": 14, "rainfall": "-"},
        { "Day": 15, "rainfall":"-"},
        { "Day": 16, "rainfall":"-"},
        { "Day": 17, "rainfall":"-"},
        { "Day": 18, "rainfall":"-"},
        { "Day": 19, "rainfall":"-"},
        { "Day": 20, "rainfall":"-"},
        { "Day": 21, "rainfall": "-"},
        { "Day": 22, "rainfall":"-"},
        { "Day": 23, "rainfall":"-"},
        { "Day": 24, "rainfall":"-"},
        { "Day": 25, "rainfall":"-"},
        { "Day": 26, "rainfall":"-"},
         { "Day": 27, "rainfall":"-"},
         { "Day": 28, "rainfall":"-"},
        { "Day": 29, "rainfall":"-"},
        { "Day": 30, "rainfall": "-"},
        { "Day": 31, "rainfall": "-"}
      ]
    },
    {
      "Month":"November",
       "rainfall": [
        { "Day": 1, "rainfall":"07"},
        { "Day": 2, "rainfall":"63"},
        { "Day": 3, "rainfall": "x"},
        { "Day": 4, "rainfall":"65"},
        { "Day": 5, "rainfall":"-"},
        { "Day": 6, "rainfall":"-"},
        { "Day": 7, "rainfall":"02"},
        { "Day": 8, "rainfall":"75"},
        { "Day": 9, "rainfall":"87"},
        { "Day": 10, "rainfall":"26"},
         { "Day": 11, "rainfall":"x"},
        { "Day": 12, "rainfall":"09"},
        { "Day": 13, "rainfall":"-"},
        { "Day": 14, "rainfall":"72"},
        { "Day": 15, "rainfall": "03"},
        { "Day": 16, "rainfall":"-"},
        { "Day": 17, "rainfall": "43"},
        { "Day": 18, "rainfall":"64"},
         { "Day": 19, "rainfall":"18"},
         { "Day": 20, "rainfall":"28"},
        { "Day": 21, "rainfall":"35"},
        { "Day": 22, "rainfall": "1.29"},
        { "Day": 23, "rainfall": "04"},
        { "Day": 24, "rainfall":"12"},
         { "Day": 25, "rainfall": "13"},
         { "Day": 26, "rainfall":"-"},
         { "Day": 27, "rainfall":"-"},
        { "Day": 28, "rainfall":"-"},
        { "Day": 29, "rainfall": "05"},
         { "Day": 30, "rainfall": "04"},
        { "Day": 31, "rainfall": "-"}
      ]
    },
    {
      "Month":"December",
       "rainfall": [
        { "Day": 1, "rainfall":"-"},
        { "Day": 2, "rainfall":"08"},
        { "Day": 3, "rainfall":"20"},
        { "Day": 4, "rainfall":"02"},
        { "Day": 5, "rainfall":"32"},
        { "Day": 6, "rainfall":"13"},
         { "Day": 7, "rainfall":"61"},
        { "Day": 8, "rainfall":"09"},
        { "Day": 9, "rainfall":"-"},
        { "Day": 10, "rainfall":"-"},
        { "Day": 11, "rainfall":"-"},
         { "Day": 12, "rainfall":"-"},
         { "Day": 13, "rainfall":"-"},
         { "Day": 14, "rainfall":"06"},
        { "Day": 15, "rainfall":"-"},
        { "Day": 16, "rainfall":"-"},
        { "Day": 17, "rainfall":"-"},
        { "Day": 18, "rainfall":"-"},
         { "Day": 19, "rainfall":"-"},
         { "Day": 20, "rainfall":"-"},
        { "Day": 21, "rainfall":"65"},
        { "Day": 22, "rainfall":"48"},
        { "Day": 23, "rainfall":"05"},
        { "Day": 24, "rainfall":"02"},
        { "Day": 25, "rainfall":"28"},
        { "Day": 26, "rainfall":"1.20"},
         { "Day": 27, "rainfall":"1.19"},
        { "Day": 28, "rainfall":"08"},
        { "Day": 29, "rainfall":"11"},
         { "Day": 30, "rainfall":"04"},
        { "Day": 31, "rainfall":"94"}
      ]
    }
  ]
}
{
"Totals": [
        "7.48",
        "3.44",
        "15.48",
        "5.89",
        "3.22",
        "5.62",
        "5.24",
        "2.13",
        "3.73",
        "2.17",
        "7.05",
        "7.42"
    ]
}

And does it work? Well, almost. Metadata is correct, monthly totals are correct, it’s getting the right numbers for daily obs (with a few missing decimal points - but that’s easy to fix). But the daily obs are not all associated with the correct dates - it’s a bit confused by missing data.

Image showing bounding boxes for extracted data

The full data extraction costs a bit less than 10,000 tokens - about 1/4 of a U.S. cent.