箱形图

用matplotlib可视化箱形图。

以下示例展示了如何使用 Matplotlib 绘制箱形图。有许多选项可以控制箱形图的外观,以及用于汇总数据的统计量。

  import matplotlib.pyplot as plt
  import numpy as np
  from matplotlib.patches import Polygon

  # Fix the random state so the figures are reproducible.
  np.random.seed(19680801)

  # Fake up some data: a uniform spread, a cluster of identical central
  # values, and a handful of high and low values that show up as fliers.
  spread = np.random.rand(50) * 100
  center = np.ones(25) * 50
  flier_high = np.random.rand(10) * 100 + 100
  flier_low = np.random.rand(10) * -100
  data = np.concatenate((spread, center, flier_high, flier_low))

  fig, axs = plt.subplots(2, 3)

  # Every option below is passed by keyword: passing notch/sym/vert/whis
  # positionally is deprecated in recent Matplotlib releases, and keywords
  # make each call self-explanatory.

  # basic plot
  axs[0, 0].boxplot(data)
  axs[0, 0].set_title('basic plot')

  # notched plot
  axs[0, 1].boxplot(data, notch=True)
  axs[0, 1].set_title('notched plot')

  # change outlier point symbols
  axs[0, 2].boxplot(data, sym='gD')
  axs[0, 2].set_title('change outlier\npoint symbols')

  # don't show outlier points
  axs[1, 0].boxplot(data, sym='')
  axs[1, 0].set_title("don't show\noutlier points")

  # horizontal boxes
  axs[1, 1].boxplot(data, sym='rs', vert=False)
  axs[1, 1].set_title('horizontal boxes')

  # change whisker length
  axs[1, 2].boxplot(data, sym='rs', vert=False, whis=0.75)
  axs[1, 2].set_title('change whisker length')

  fig.subplots_adjust(left=0.08, right=0.98, bottom=0.05, top=0.9,
                      hspace=0.4, wspace=0.3)

  # fake up some more data
  spread = np.random.rand(50) * 100
  center = np.ones(25) * 40
  flier_high = np.random.rand(10) * 100 + 100
  flier_low = np.random.rand(10) * -100
  d2 = np.concatenate((spread, center, flier_high, flier_low))

  # Making a 2-D array only works if all the columns are the
  # same length. If they are not, then use a list instead.
  # This is actually more efficient because boxplot converts
  # a 2-D array into a list of vectors internally anyway.
  # Each list entry is kept as a plain 1-D vector; the third entry is
  # every second sample of d2, so the datasets have different lengths.
  data = [data, d2, d2[::2]]

  # Multiple box plots on one Axes
  fig, ax = plt.subplots()
  ax.boxplot(data)

  plt.show()

下面我们将从五个不同的概率分布生成数据,每个概率分布具有不同的特征。我们想要了解对数据进行独立同分布(IID)自助法(bootstrap)重采样后,原始样本的分布特性能在多大程度上得到保留,而箱形图正是进行这种评估的一种可视化工具。

  # Compare five samples drawn from different distributions with an IID
  # bootstrap resample of each, using one box per sample.
  # Relies on `np`, `plt` and `Polygon` imported by the first listing.
  numDists = 5
  randomDists = ['Normal(1,1)', ' Lognormal(1,1)', 'Exp(1)', 'Gumbel(6,4)',
                 'Triangular(2,9,11)']
  N = 500

  norm = np.random.normal(1, 1, N)
  logn = np.random.lognormal(1, 1, N)
  expo = np.random.exponential(1, N)
  gumb = np.random.gumbel(6, 4, N)
  tria = np.random.triangular(2, 9, 11, N)

  # Generate some random indices that we'll use to resample the original data
  # arrays. For code brevity, just use the same random indices for each array.
  # randint's upper bound is exclusive, so this draws from 0..N-1 inclusive
  # (np.random.random_integers, used previously, no longer exists in NumPy 2).
  bootstrapIndices = np.random.randint(0, N, N)
  normBoot = norm[bootstrapIndices]
  expoBoot = expo[bootstrapIndices]
  gumbBoot = gumb[bootstrapIndices]
  lognBoot = logn[bootstrapIndices]
  triaBoot = tria[bootstrapIndices]

  # Original sample and its resample sit side by side for each distribution.
  data = [norm, normBoot, logn, lognBoot, expo, expoBoot, gumb, gumbBoot,
          tria, triaBoot]

  fig, ax1 = plt.subplots(figsize=(10, 6))
  # The window title belongs to the figure manager; the canvas-level
  # set_window_title shortcut was removed from Matplotlib.
  fig.canvas.manager.set_window_title('A Boxplot Example')
  fig.subplots_adjust(left=0.075, right=0.95, top=0.9, bottom=0.25)

  # Vertical boxes are the default, so no orientation argument is needed.
  bp = ax1.boxplot(data, notch=False, sym='+', whis=1.5)
  plt.setp(bp['boxes'], color='black')
  plt.setp(bp['whiskers'], color='black')
  plt.setp(bp['fliers'], color='red', marker='+')

  # Add a horizontal grid to the plot, but make it very light in color
  # so we can use it for reading data values but not be distracting
  ax1.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
                 alpha=0.5)

  # Hide the grid behind plot objects
  ax1.set_axisbelow(True)
  ax1.set_title('Comparison of IID Bootstrap Resampling Across Five Distributions')
  ax1.set_xlabel('Distribution')
  ax1.set_ylabel('Value')

  # Now fill the boxes with desired colors
  boxColors = ['darkkhaki', 'royalblue']
  numBoxes = numDists*2
  medians = []
  for i in range(numBoxes):
      box = bp['boxes'][i]
      # Use the first five vertices of the box outline as the polygon.
      boxCoords = np.column_stack([box.get_xdata()[:5], box.get_ydata()[:5]])
      # Alternate between Dark Khaki and Royal Blue
      k = i % 2
      ax1.add_patch(Polygon(boxCoords, facecolor=boxColors[k]))

      # Now draw the median lines back over what we just filled in
      med = bp['medians'][i]
      medianX = list(med.get_xdata()[:2])
      medianY = list(med.get_ydata()[:2])
      ax1.plot(medianX, medianY, 'k')
      medians.append(medianY[0])

      # Finally, overplot the sample averages, with horizontal alignment
      # in the center of each box
      ax1.plot([np.average(med.get_xdata())], [np.average(data[i])],
               color='w', marker='*', markeredgecolor='k')

  # Set the axes ranges and axes labels
  ax1.set_xlim(0.5, numBoxes + 0.5)
  top = 40
  bottom = -5
  ax1.set_ylim(bottom, top)
  ax1.set_xticklabels(np.repeat(randomDists, 2),
                      rotation=45, fontsize=8)

  # Due to the Y-axis scale being different across samples, it can be
  # hard to compare differences in medians across the samples. Add upper
  # X-axis tick labels with the sample medians to aid in comparison
  # (just use two decimal places of precision)
  pos = np.arange(numBoxes) + 1
  upperLabels = [str(np.round(s, 2)) for s in medians]
  weights = ['bold', 'semibold']
  for tick in range(numBoxes):
      k = tick % 2
      ax1.text(pos[tick], top - (top*0.05), upperLabels[tick],
               horizontalalignment='center', size='x-small', weight=weights[k],
               color=boxColors[k])

  # Finally, add a basic legend
  fig.text(0.80, 0.08, str(N) + ' Random Numbers',
           backgroundcolor=boxColors[0], color='black', weight='roman',
           size='x-small')
  fig.text(0.80, 0.045, 'IID Bootstrap Resample',
           backgroundcolor=boxColors[1],
           color='white', weight='roman', size='x-small')
  fig.text(0.80, 0.015, '*', color='white', backgroundcolor='silver',
           weight='roman', size='medium')
  fig.text(0.815, 0.013, ' Average Value', color='black', weight='roman',
           size='x-small')

  plt.show()

箱形图

在这里,我们编写一个自定义函数,用自助法(bootstrap)计算置信区间。然后可以将该函数的结果传给 boxplot,在箱形图上显示这些置信区间。

  def fakeBootStrapper(n):
      """Return a placeholder median and confidence interval.

      This is just a stand-in for the user's own method of bootstrapping
      the median and its confidence interval; the values are arbitrary
      constants and no resampling is performed.

      Parameters
      ----------
      n : int
          Selector for which canned result to return. ``1`` selects the
          first result; any other value selects the second.

      Returns
      -------
      tuple
          ``(med, CI)`` where ``med`` is a float and ``CI`` is a
          ``(low, high)`` tuple of floats.
      """
      if n == 1:
          med = 0.1
          CI = (-0.25, 0.25)
      else:
          med = 0.2
          CI = (-0.35, 0.50)
      return med, CI
  # Draw notched boxes for four treatments, overriding the median and the
  # confidence interval of the last two with values from fakeBootStrapper.
  # Relies on `np` and `plt` imported by the first listing.
  inc = 0.1
  e1 = np.random.normal(0, 1, size=(500,))
  e2 = np.random.normal(0, 1, size=(500,))
  e3 = np.random.normal(0, 1 + inc, size=(500,))
  e4 = np.random.normal(0, 1 + 2*inc, size=(500,))

  treatments = [e1, e2, e3, e4]
  med1, CI1 = fakeBootStrapper(1)
  med2, CI2 = fakeBootStrapper(2)
  # One entry per treatment; only the last two carry user-supplied values,
  # the first two are left as None.
  medians = [None, None, med1, med2]
  conf_intervals = [None, None, CI1, CI2]

  fig, ax = plt.subplots()
  # Boxes are placed at x = 1, 2, ..., len(treatments).
  pos = np.arange(len(treatments)) + 1
  bp = ax.boxplot(treatments, sym='k+', positions=pos,
                  notch=True, bootstrap=5000,
                  usermedians=medians,
                  conf_intervals=conf_intervals)

  ax.set_xlabel('treatment')
  ax.set_ylabel('response')
  plt.setp(bp['whiskers'], color='k', linestyle='-')
  plt.setp(bp['fliers'], markersize=3.0)
  plt.show()

箱形图2

下载这个示例