箱形图
使用 Matplotlib 绘制箱形图。
以下示例展示了如何使用 Matplotlib 绘制箱形图。有许多选项可以控制箱形图的外观,以及用于汇总数据的统计量。
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Polygon

# Fix the random state so the figures are reproducible.
np.random.seed(19680801)

# Fake up some data: a uniform spread, a block of identical central values,
# and a handful of high and low outliers.
spread = np.random.rand(50) * 100
center = np.ones(25) * 50
flier_high = np.random.rand(10) * 100 + 100
flier_low = np.random.rand(10) * -100
data = np.concatenate((spread, center, flier_high, flier_low))

fig, axs = plt.subplots(2, 3)

# Basic plot.
axs[0, 0].boxplot(data)
axs[0, 0].set_title('basic plot')

# Notched plot.  Options are passed by keyword throughout: the positional
# 0/1 flags of the original are hard to read.
axs[0, 1].boxplot(data, notch=True)
axs[0, 1].set_title('notched plot')

# Change outlier point symbols.
axs[0, 2].boxplot(data, sym='gD')
axs[0, 2].set_title('change outlier\npoint symbols')

# Don't show outlier points (an empty symbol string hides them).
axs[1, 0].boxplot(data, sym='')
axs[1, 0].set_title("don't show\noutlier points")

# Horizontal boxes.
axs[1, 1].boxplot(data, sym='rs', vert=False)
axs[1, 1].set_title('horizontal boxes')

# Change whisker length.
axs[1, 2].boxplot(data, sym='rs', vert=False, whis=0.75)
axs[1, 2].set_title('change whisker length')

fig.subplots_adjust(left=0.08, right=0.98, bottom=0.05, top=0.9,
                    hspace=0.4, wspace=0.3)

# Fake up some more data.
spread = np.random.rand(50) * 100
center = np.ones(25) * 40
flier_high = np.random.rand(10) * 100 + 100
flier_low = np.random.rand(10) * -100
d2 = np.concatenate((spread, center, flier_high, flier_low))

# Turn both samples into single-column 2-D arrays.  reshape() is used
# instead of assigning to ``.shape`` in place.
data = data.reshape(-1, 1)
d2 = d2.reshape(-1, 1)

# Making a 2-D array only works if all the columns are the same length.
# If they are not, use a list instead.  This is actually more efficient
# because boxplot converts a 2-D array into a list of vectors internally
# anyway.
data = [data, d2, d2[::2, 0]]

# Multiple box plots on one Axes.
fig, ax = plt.subplots()
ax.boxplot(data)

plt.show()
下面我们从五个不同的概率分布生成数据,每个分布具有不同的特征。我们想了解对数据进行独立同分布(IID)自助法(bootstrap)重采样后,原始样本的分布特性能在多大程度上得以保留,而箱形图正是进行这种评估的一种可视化工具。
numDists = 5
randomDists = ['Normal(1,1)', ' Lognormal(1,1)', 'Exp(1)', 'Gumbel(6,4)',
               'Triangular(2,9,11)']
N = 500

# Draw N samples from each of the five distributions.
norm = np.random.normal(1, 1, N)
logn = np.random.lognormal(1, 1, N)
expo = np.random.exponential(1, N)
gumb = np.random.gumbel(6, 4, N)
tria = np.random.triangular(2, 9, 11, N)

# Generate some random indices that we'll use to resample the original data
# arrays.  For code brevity, just use the same random indices for each array.
# randint's upper bound is exclusive, so (0, N) covers the same index range
# 0..N-1 as the deprecated random_integers(0, N - 1).
bootstrapIndices = np.random.randint(0, N, N)
normBoot = norm[bootstrapIndices]
expoBoot = expo[bootstrapIndices]
gumbBoot = gumb[bootstrapIndices]
lognBoot = logn[bootstrapIndices]
triaBoot = tria[bootstrapIndices]

# Each original sample is immediately followed by its bootstrap resample.
data = [norm, normBoot, logn, lognBoot, expo, expoBoot, gumb, gumbBoot,
        tria, triaBoot]

fig, ax1 = plt.subplots(figsize=(10, 6))
# The window title is set through the figure manager; the canvas-level
# set_window_title() is no longer available in current Matplotlib.
fig.canvas.manager.set_window_title('A Boxplot Example')
fig.subplots_adjust(left=0.075, right=0.95, top=0.9, bottom=0.25)

# Un-notched boxes in the default vertical orientation.
bp = ax1.boxplot(data, notch=False, sym='+', whis=1.5)
plt.setp(bp['boxes'], color='black')
plt.setp(bp['whiskers'], color='black')
plt.setp(bp['fliers'], color='red', marker='+')

# Add a horizontal grid to the plot, but make it very light in color
# so we can use it for reading data values but not be distracting.
ax1.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
               alpha=0.5)

# Hide the grid behind plot objects.
ax1.set_axisbelow(True)
ax1.set_title('Comparison of IID Bootstrap Resampling Across Five Distributions')
ax1.set_xlabel('Distribution')
ax1.set_ylabel('Value')

# Now fill the boxes with desired colors.
boxColors = ['darkkhaki', 'royalblue']
numBoxes = numDists * 2
medians = []
for i in range(numBoxes):
    box = bp['boxes'][i]
    # The box outline is a closed path; its first five vertices are used
    # as the polygon to fill.
    boxCoords = np.column_stack([box.get_xdata()[:5], box.get_ydata()[:5]])
    # Alternate between Dark Khaki and Royal Blue.
    k = i % 2
    boxPolygon = Polygon(boxCoords, facecolor=boxColors[k])
    ax1.add_patch(boxPolygon)

    # Now draw the median line back over what we just filled in.
    med = bp['medians'][i]
    medianX = med.get_xdata()[:2]
    medianY = med.get_ydata()[:2]
    ax1.plot(medianX, medianY, 'k')
    medians.append(medianY[0])

    # Finally, overplot the sample average, horizontally centred on the box.
    ax1.plot([np.average(med.get_xdata())], [np.average(data[i])],
             color='w', marker='*', markeredgecolor='k')

# Set the axes ranges and axes labels.
ax1.set_xlim(0.5, numBoxes + 0.5)
top = 40
bottom = -5
ax1.set_ylim(bottom, top)
ax1.set_xticklabels(np.repeat(randomDists, 2), rotation=45, fontsize=8)

# Due to the Y-axis scale being different across samples, it can be
# hard to compare differences in medians across the samples.  Add upper
# X-axis tick labels with the sample medians to aid in comparison
# (just use two decimal places of precision).
pos = np.arange(numBoxes) + 1
upperLabels = [str(np.round(s, 2)) for s in medians]
weights = ['bold', 'semibold']
for tick in range(numBoxes):
    k = tick % 2
    ax1.text(pos[tick], top - (top * 0.05), upperLabels[tick],
             horizontalalignment='center', size='x-small',
             weight=weights[k], color=boxColors[k])

# Finally, add a basic legend built from figure-level text.
fig.text(0.80, 0.08, str(N) + ' Random Numbers',
         backgroundcolor=boxColors[0], color='black', weight='roman',
         size='x-small')
fig.text(0.80, 0.045, 'IID Bootstrap Resample',
         backgroundcolor=boxColors[1],
         color='white', weight='roman', size='x-small')
fig.text(0.80, 0.015, '*', color='white', backgroundcolor='silver',
         weight='roman', size='medium')
fig.text(0.815, 0.013, ' Average Value', color='black', weight='roman',
         size='x-small')

plt.show()

在这里,我们编写一个自定义函数,用来以自助法(bootstrap)给出中位数及其置信区间(本例中只是一个返回固定值的占位实现)。然后可以把该函数的结果传给 boxplot,以显示这些置信区间。
def fakeBootStrapper(n):
    """Return a canned median and confidence interval.

    This is just a placeholder for the user's method of bootstrapping the
    median and its confidence intervals.

    Parameters
    ----------
    n : int
        Selector for which canned result to return: ``1`` gives the first
        set of values, anything else gives the second.

    Returns
    -------
    tuple
        ``(med, CI)`` where ``med`` is a float and ``CI`` is a
        ``(low, high)`` tuple of floats.
    """
    if n == 1:
        med = 0.1
        CI = (-0.25, 0.25)
    else:
        med = 0.2
        CI = (-0.35, 0.50)
    return med, CI


inc = 0.1
# Four normally distributed samples; the last two have a larger spread.
e1 = np.random.normal(0, 1, size=(500,))
e2 = np.random.normal(0, 1, size=(500,))
e3 = np.random.normal(0, 1 + inc, size=(500,))
e4 = np.random.normal(0, 1 + 2*inc, size=(500,))

treatments = [e1, e2, e3, e4]
med1, CI1 = fakeBootStrapper(1)
med2, CI2 = fakeBootStrapper(2)
# None entries leave the first two boxes to boxplot's own computation;
# the last two boxes use the user-supplied values.
medians = [None, None, med1, med2]
conf_intervals = [None, None, CI1, CI2]

fig, ax = plt.subplots()
# Box positions 1..len(treatments).
pos = np.arange(len(treatments)) + 1
bp = ax.boxplot(treatments, sym='k+', positions=pos,
                notch=True, bootstrap=5000,
                usermedians=medians,
                conf_intervals=conf_intervals)

ax.set_xlabel('treatment')
ax.set_ylabel('response')
plt.setp(bp['whiskers'], color='k', linestyle='-')
plt.setp(bp['fliers'], markersize=3.0)
plt.show()

