Script to plot successes and failures¶

#!/usr/bin/env python

# Overplot transcription results on the original image.
# Show successful, erronious, and missing results.

import argparse
import pickle
from PIL import Image

import matplotlib
from matplotlib.backends.backend_agg import \
             FigureCanvasAgg as FigureCanvas
from matplotlib.figure import Figure
import matplotlib.patches
from matplotlib.cm import Set2
import numpy
from scipy.stats import gaussian_kde
import pandas

from protobuf_to_dict import protobuf_to_dict

parser = argparse.ArgumentParser()
parser.add_argument("--source", help="Image file name",
                    type=str,default='modified.jpg')
parser.add_argument("--pickle", help="Pickled GCV results file name",
                    type=str,default='detection.pkl')
parser.add_argument("--benchmark", help="Benchmark results file name",
                    type=str,required=True)
parser.add_argument("--ndays", help="How many days in this month",
                    type=int,required=True)
parser.add_argument("--guides", help="Plot the day+hour guidelines",
                    type=bool,default=False)
parser.add_argument("--opimg", help="Image output file name",
                    default="oplot.png",
                    type=str,required=False)
parser.add_argument("--opstats", help="Statistics output file name",
                    default="stats.pkl",
                    type=str,required=False)
args = parser.parse_args()

# We're going to need the original image
im = Image.open(args.source)
imwidth, imheight = im.size

# And the benchmark data
bm = pandas.read_csv(args.benchmark)

fig=Figure(figsize=((1*im.size[0]/100)*1.04,
                    (1*im.size[1]/100)*1.04),
       dpi=100,
       facecolor=(0.88,0.88,0.88,1),
       edgecolor=None,
       linewidth=0.0,
       frameon=False,
       subplotpars=None,
       tight_layout=None)
ax_original=fig.add_axes([0.02,0.02,0.96,0.96],label='original')
ax_result=fig.add_axes([0.02,0.02,0.96,0.96],label='result')
# Matplotlib magic
canvas=FigureCanvas(fig)
# Turn off the axis tics
ax_original.set_axis_off()
ax_result.set_axis_off()

# Show the original image
ax_original.imshow(im)

# Load the JSON from GCV for this image
document=pickle.load( open( args.pickle, "rb" ) )

# Convert it to a dict so we can modify it
document=protobuf_to_dict(document)

# GV is bad at deciding which characters form part of the same word
#  patch this up a bit.
def merge_words(w1,w2):
    w1['symbols']=w1['symbols']+w2['symbols']
    w1['bounding_box']['vertices'][0]['x']=min(w1['bounding_box']['vertices'][0]['x'],
                                               w2['bounding_box']['vertices'][0]['x'])
    w1['bounding_box']['vertices'][0]['y']=min(w1['bounding_box']['vertices'][0]['y'],
                                               w2['bounding_box']['vertices'][0]['y'])
    w1['bounding_box']['vertices'][1]['x']=max(w1['bounding_box']['vertices'][1]['x'],
                                               w2['bounding_box']['vertices'][1]['x'])
    w1['bounding_box']['vertices'][1]['y']=min(w1['bounding_box']['vertices'][1]['y'],
                                               w2['bounding_box']['vertices'][1]['y'])
    w1['bounding_box']['vertices'][2]['x']=max(w1['bounding_box']['vertices'][2]['x'],
                                               w2['bounding_box']['vertices'][2]['x'])
    w1['bounding_box']['vertices'][2]['y']=max(w1['bounding_box']['vertices'][2]['y'],
                                               w2['bounding_box']['vertices'][2]['y'])
    w1['bounding_box']['vertices'][3]['x']=min(w1['bounding_box']['vertices'][3]['x'],
                                               w2['bounding_box']['vertices'][3]['x'])
    w1['bounding_box']['vertices'][3]['y']=max(w1['bounding_box']['vertices'][3]['y'],
                                               w2['bounding_box']['vertices'][3]['y'])
    return(w1)

# Most of the numbers we want are split into three words - merge them
def reword(doc):
     for page in document['pages']:
        for block in page['blocks']:
            for paragraph in block['paragraphs']:
                paragraph['newwords']=[]
                last_word=None
                for word in paragraph['words']:
                    if (last_word is not None and
                        len(word['symbols'])==3 and
                        not last_word['symbols'][0]['text'].isdigit()):
                        try:
                            word=merge_words(last_word,word)
                        except KeyError:
                            word=word
                        paragraph['newwords'].append(word)
                        last_word=None
                    else:
                        if last_word is not None:
                            paragraph['newwords'].append(last_word)
                        last_word=word
                if last_word is not None:
                    paragraph['newwords'].append(last_word)
                paragraph['threewords']=[]
                last_word=None
                for word in paragraph['newwords']:
                    if (last_word is not None and
                        len(word['symbols'])==4 and
                        last_word['symbols'][0]['text'].isdigit()):
                        try:
                            word=merge_words(last_word,word)
                        except KeyError:
                            word=word
                        paragraph['threewords'].append(word)
                        last_word=None
                    else:
                        if last_word is not None:
                            paragraph['threewords'].append(last_word)
                        last_word=word
                if last_word is not None:
                    paragraph['threewords'].append(last_word)
     return(document)
 
document=reword(document)

# Convert box vertex list to numpy array for matplotlib
def bb2p(bb):
    result=numpy.zeros((len(bb['vertices']),2))
    for idx in range(len(bb['vertices'])):
        result[idx,0]=bb['vertices'][idx]['x']/imwidth
        result[idx,1]=1.0-bb['vertices'][idx]['y']/imheight
    return result

# Get bounding box centroid for text
def b2t(bb):
    vertices=bb2p(bb)
    result=[0,0]
    result[0]=numpy.mean(vertices[:,0])
    result[1]=numpy.mean(vertices[:,1])
    return result

# Make fake bounding box around point for missing items
def m2p(x,y):
    result=numpy.zeros((4,2))
    result[0,0]=x-0.014
    result[0,1]=y-0.008
    result[1,0]=x-0.014
    result[1,1]=y+0.008
    result[2,0]=x+0.014
    result[2,1]=y+0.008
    result[3,0]=x+0.014
    result[3,1]=y-0.008
    return result

# Get bounding box metrics
def bb_width(bb):
    vertices=bb2p(bb)
    width=numpy.max(vertices[:,0])-numpy.min(vertices[:,0])
    return width

def bb_height(bb):
    vertices=bb2p(bb)
    height=numpy.max(vertices[:,1])-numpy.min(vertices[:,1])
    return height

def bb_left(bb):
    vertices=bb2p(bb)
    left=numpy.min(vertices[:,0])
    return left

def bb_right(bb):
    vertices=bb2p(bb)
    right=numpy.max(vertices[:,0])
    return right

def bb_top(bb):
    vertices=bb2p(bb)
    top=numpy.max(vertices[:,1])
    return top

def bb_bottom(bb):
    vertices=bb2p(bb)
    bottom=numpy.min(vertices[:,1])
    return bottom

# Do a few obvious fix-ups on the GCV output
def fixup(text):
    # Insert the decimal point where it might have been missed
    if len(text)==4:
       text=text[0]+'.'+text[1:]
    # Replace any non-digit with decimal point
    for id in range(len(text)):
        if not text[id].isdigit(): 
            text=text[0:id]+'.'+text[id+1:]
    return text

# Get the characters comprising a word as a single string
def getWordString(word):
    ws=''
    for symbol in word['symbols']:
        ws=ws+symbol['text']
    return ws


def hasNumbers(word):
    return any(char.isdigit() for char in getWordString(word))

# Find the column locations corresponding to hours, 
#  and the row locations corresponding to days
xc=[]
yc=[]
for page in document['pages']:
    for block in page['blocks']:
        for paragraph in block['paragraphs']:
            for word in paragraph['threewords']:
                try:
                    if not hasNumbers(word): continue
                    cent=b2t(word['bounding_box'])
                    xc.append(cent[0])
                    yc.append(cent[1])
                except KeyError:
                    continue

# Find clusters of points in a and y - these are the rows and columns
def get_columns(xc):
    kde = gaussian_kde(xc,bw_method=0.03)
    x_grid=numpy.linspace(0,1,1000)
    dens=kde.evaluate(x_grid)
    col=[]
    for i in range(1,len(x_grid)-1):
        if (dens[i]>1 and 
            dens[i]>dens[i-1] and
            dens[i]>dens[i+1]):
            col.append(x_grid[i])
    return(col)

columns=get_columns(xc)
# If fewer than 26 columns we've lost some in the fold, add it back
while len(columns)<26:
   columns.insert(0,columns[0]-(columns[1]-columns[0])) 
columns=columns[1:(len(columns)-1)]
rows=get_columns(yc)
# Image might or might not have a row of hour numbers
if len(rows)==args.ndays+2:
    rows=rows[1:(len(rows)-1)]
else:
    rows=rows[1:(len(rows))]

# Draw the breaks
if args.guides==True:
    for clm in columns:
        ax_result.add_line(matplotlib.lines.Line2D(
                xdata=(clm,clm),
                ydata=(0,1),
                linestyle='solid',
                linewidth=1,
                color=(1,0,0,1),
                zorder=5))
    for row in rows:
        ax_result.add_line(matplotlib.lines.Line2D(
                xdata=(0,1),
                ydata=(row,row),
                linestyle='solid',
                linewidth=1,
                color=(1,0,0,1),
                zorder=5))

# Keep track of good, bad, and missing data
stats={'Total':   0,
       'Good':    0,
       'Bad':     0,
       'Missing': 0}

# Make 2d array of blocks - for each row/column intersection
data_points=[[None] * len(columns) for i in range(len(rows))]
for page in document['pages']:
    for block in page['blocks']:
        for paragraph in block['paragraphs']:
            for word in paragraph['threewords']:
                try:
                    if (bb_width(word['bounding_box']) > 0.045 or
                        bb_width(word['bounding_box']) < 0.015 or
                        bb_height(word['bounding_box']) > 0.03 or
                        bb_height(word['bounding_box']) < 0.001): continue
                    at_column=None
                    for cmi in range(len(columns)):
                        clm=columns[cmi]
                        if (bb_left(word['bounding_box'])<clm and
                            bb_right(word['bounding_box'])>clm):
                            at_column=cmi
                            break
                    if at_column is None: continue
                    for rwi in range(len(rows)):
                        row=rows[rwi]
                        if (bb_top(word['bounding_box'])>row and
                            bb_bottom(word['bounding_box'])<row):
                            data_points[rwi][at_column]=word
                            break
                except KeyError:
                    continue
# Draw a marker at every hour - either a text block or a missing marker
for rwi in range(len(rows)):
   for cmi in range(len(columns)):
      try:
          stats['Total'] += 1
          if data_points[rwi][cmi] is None:
            # Mark as missing
               stats['Missing'] += 1
               pp=matplotlib.patches.Polygon(m2p(columns[cmi],rows[rwi]),
                                             closed=True,
                                             fill=False, 
                                             hatch='/',
                                             edgecolor=(1,0,0,1),
                                             facecolor=(1,0,0,1),
                                             linewidth=1.0,
                                             alpha=1.0,
                                             zorder=10)
               ax_result.add_patch(pp)
          else:
              word=data_points[rwi][cmi]
              # Compare the GCV value with the benchmark
              try:
                  bmvalue=bm.iloc[len(rows)-rwi-1][cmi+1].replace("'","")
              except:
                  bmvalue=0.0
              if fixup(getWordString(word))==bmvalue:
                  stats['Good'] += 1 
                  bcol=(0.7,1,0.7,1)
              else:
                  stats['Bad'] += 1
                  bcol=(1,0.5,0.5,1)
              pp=matplotlib.patches.Polygon(bb2p(word['bounding_box']),
                                            closed=True,
                                            edgecolor=bcol,
                                            facecolor=bcol,
                                            fill=True,
                                            linewidth=0.2,
                                            alpha=0.85,
                                            zorder=10)
              ax_result.add_patch(pp)
           # Text
              txt_centroid=b2t(word['bounding_box'])
              angle=0
              ax_result.text(txt_centroid[0],txt_centroid[1],
                             fixup(getWordString(word)),
                             fontsize=18,
                             verticalalignment='center',
                             horizontalalignment='center',
                             rotation=angle,
                             zorder=10)
      except KeyError:
          continue

# Draw the image
fig.savefig(args.opimg)

# Output the stats
print(stats)
with open( args.opstats, "wb" ) as pkf:
    pickle.dump(stats, pkf )
Table of Contents

Get the source code

Found a bug, or have a suggestion?

Script to plot successes and failures¶