Tuesday, January 17, 2012

Customized Boxplots with Matplotlib

Recently, I wanted to use a box plot to describe some data I had collected. I had a program whose response latency depended on a variable k and I wanted to show some information about the distribution of latencies for each value of k that I tested. Box plots, in my opinion, are perfect for this kind of display.

My toolkit of choice for plotting or graphing data happens to be the combination of numpy, matplotlib, and scipy. I immediately found the function I was looking for: matplotlib.pyplot.boxplot. Unfortunately, I wanted the box plot to show information about the 99th percentile, and alas, this function will only draw whiskers based on the interquartile range (IQR).

So I wrote my own box plot implementation that's a little bit more generic and, most importantly, meets my needs.

# @author: Aaron Blankstein 

from scipy.stats import scoreatpercentile

class boxplotter(object):
    """A single box-plot glyph defined by precomputed summary values.

    The instance only stores the y-values of the box edges, the median and
    the optional whisker ends; it does not compute any statistics itself.
    Call :meth:`draw_on` to render it on an axes object.

    Attributes:
        median: y-value of the median line.
        top: y-value of the upper edge of the box.
        bott: y-value of the lower edge of the box.
        whisk_top: y-value of the upper whisker cap, or None for no whisker.
        whisk_bott: y-value of the lower whisker cap, or None for no whisker.
    """

    def __init__(self, median, top, bottom, whisk_top=None,
                 whisk_bottom=None):
        """Store the summary values that describe the glyph.

        Args:
            median: y-value of the median line.
            top: y-value of the upper edge of the box.
            bottom: y-value of the lower edge of the box.
            whisk_top: y-value of the upper whisker cap; None disables it.
            whisk_bottom: y-value of the lower whisker cap; None disables it.
        """
        self.median = median
        self.top = top
        self.bott = bottom
        self.whisk_top = whisk_top
        self.whisk_bott = whisk_bottom

    def draw_on(self, ax, index, box_color="blue",
                median_color="red", whisker_color="black", width=.7):
        """Draw the glyph on *ax*, horizontally centred on x = *index*.

        Everything is drawn through ``ax.broken_barh``: the box is one
        rectangle, while the median and whisker caps are zero-height
        rectangles and the whisker stems are zero-width rectangles.

        Args:
            ax: object providing ``broken_barh(xranges, yrange, **kwargs)``.
            index: x-coordinate of the centre of the box.
            box_color: edge colour of the box and of the dashed stems.
            median_color: edge colour of the median line.
            whisker_color: edge colour of the whisker caps.
            width: total horizontal width of the box, median line and caps.
        """
        w2 = width / 2
        full_span = [(index - w2, width)]

        # The box itself, from the bottom edge up to the top edge.
        ax.broken_barh(full_span,
                       (self.bott, self.top - self.bott),
                       facecolor="white", edgecolor=box_color)
        # Median: a zero-height bar, so only its edge is visible.
        ax.broken_barh(full_span,
                       (self.median, 0),
                       facecolor="white", edgecolor=median_color)

        # Upper whisker first, then lower; each is a cap plus a dashed stem
        # joining the cap to the box edge it belongs to.
        whiskers = ((self.whisk_top, self.top),
                    (self.whisk_bott, self.bott))
        for cap, box_edge in whiskers:
            if cap is None:
                continue
            ax.broken_barh(full_span,
                           (cap, 0),
                           facecolor="white", edgecolor=whisker_color)
            # Height is measured from the cap, so it is negative for the
            # upper whisker (cap above the box edge).
            ax.broken_barh([(index, 0)],
                           (cap, box_edge - cap),
                           edgecolor=box_color, linestyle="dashed")

def percentile_box_plot(ax, data, indexer=None, box_top=75,
                        box_bottom=25, whisker_top=99, whisker_bottom=1):
    """Draw one percentile-based box per element of *data* on *ax*.

    For every element the median (50th percentile), the box edges and the
    whisker ends are computed with ``scoreatpercentile`` and rendered through
    a ``boxplotter``.

    Args:
        ax: axes object handed to ``boxplotter.draw_on``.
        data: sequence of samples; each sample yields one box.
        indexer: optional callable mapping a sample to its x position. When
            omitted, samples are placed at 1, 2, ..., len(data).
        box_top: percentile used for the upper edge of the box.
        box_bottom: percentile used for the lower edge of the box.
        whisker_top: percentile of the upper whisker, or None to omit it.
        whisker_bottom: percentile of the lower whisker, or None to omit it.
    """
    if indexer is not None:
        placed = [(indexer(sample), sample) for sample in data]
    else:
        placed = zip(range(1, len(data) + 1), data)

    for position, sample in placed:
        middle = scoreatpercentile(sample, 50)
        upper_edge = scoreatpercentile(sample, box_top)
        lower_edge = scoreatpercentile(sample, box_bottom)
        # A None percentile means "no whisker on this side".
        if whisker_top is None:
            upper_cap = None
        else:
            upper_cap = scoreatpercentile(sample, whisker_top)
        if whisker_bottom is None:
            lower_cap = None
        else:
            lower_cap = scoreatpercentile(sample, whisker_bottom)

        glyph = boxplotter(middle, upper_edge, lower_edge,
                           upper_cap, lower_cap)
        glyph.draw_on(ax, position)

def example():
    """Plot three fake samples as percentile box plots into example.png."""
    from pylab import rand, ones, concatenate
    import matplotlib.pyplot as plt

    # EXAMPLE data code from:
    # http://matplotlib.sourceforge.net/pyplots/boxplot_demo.py
    def fake_column(center_value):
        # fake up some data: a uniform spread, a cluster of identical
        # values, and a handful of high and low fliers, as one column.
        spread = rand(50) * 100
        center = ones(25) * center_value
        flier_high = rand(10) * 100 + 100
        flier_low = rand(10) * -100
        column = concatenate((spread, center, flier_high, flier_low), 0)
        column.shape = (-1, 1)
        return column

    first = fake_column(50)
    second = fake_column(40)
    # Third sample: every other value of the second one, as a 1-D array.
    data = [first, second, second[::2, 0]]

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlim(0, 4)
    percentile_box_plot(ax, data)
    plt.savefig('example.png')
 
# Render the demo figure only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    example()


The example() function produced the lovely box plot above. If you pass None for either whisker percentile, that particular whisker won't be drawn. Anyway, happy plotting.