02 气泡图3.png 横坐标依然是面积,纵坐标依然是人口

从对散点图的数据解读来看,为散点添加不同的颜色是增加图像中的信息维度
而气泡图也是一样:通过给散点增加面积信息,来增加图像中的信息维度

  1. import numpy as np
  2. import pandas as pd
  3. import matplotlib as mpl
  4. import matplotlib.pyplot as plt
  5. import seaborn as sns
  6. %matplotlib inline
  1. midwest = pd.read_csv("midwest_filter.csv")
  1. midwest.head()
PID county state area poptotal popdensity popwhite popblack popamerindian popasian percprof poppovertyknown percpovertyknown percbelowpoverty percchildbelowpovert percadultpoverty percelderlypoverty inmetro category dot_size
0 561 ADAMS IL 0.052 66090 1270.961540 63917 1702 98 249 4.355859 63628 96.274777 13.151443 18.011717 11.009776 12.443812 0 AAR 250.944411
1 562 ALEXANDER IL 0.014 10626 759.000000 7054 3496 19 48 2.870315 10529 99.087145 32.244278 45.826514 27.385647 25.228976 0 LHR 185.781260
2 563 BOND IL 0.022 14991 681.409091 14477 429 35 16 4.488572 14235 94.956974 12.068844 14.036061 10.852090 12.697410 0 AAR 175.905385
3 564 BOONE IL 0.017 30806 1812.117650 29344 127 46 150 4.197800 30337 98.477569 7.209019 11.179536 5.536013 6.217047 1 ALU 319.823487
4 565 BROWN IL 0.018 5836 324.222222 5264 547 14 5 3.367680 4815 82.505140 13.520249 13.022889 11.143211 19.200000 0 AAR 130.442161

5 rows × 29 columns

  1. midwest.columns
  1. Index(['PID', 'county', 'state', 'area', 'poptotal', 'popdensity', 'popwhite',
  2. 'popblack', 'popamerindian', 'popasian', 'popother', 'percwhite',
  3. 'percblack', 'percamerindan', 'percasian', 'percother', 'popadults',
  4. 'perchsd', 'percollege', 'percprof', 'poppovertyknown',
  5. 'percpovertyknown', 'percbelowpoverty', 'percchildbelowpovert',
  6. 'percadultpoverty', 'percelderlypoverty', 'inmetro', 'category',
  7. 'dot_size'],
  8. dtype='object')
  1. # Preset default figure properties for the plots that follow
  2. large = 22; med = 16; small = 12
  3. params = {'axes.titlesize': large, # font size of each subplot (axes) title
  4. 'legend.fontsize': med, # font size of the legend text
  5. 'figure.figsize': (16, 10), # default canvas size
  6. 'axes.labelsize': med, # font size of the axis labels
  7. 'xtick.labelsize': med, # font size of the x-axis tick labels
  8. 'ytick.labelsize': med, # font size of the y-axis tick labels
  9. 'figure.titlesize': large} # font size of the figure-level title
  10. plt.rcParams.update(params) # apply the values above as matplotlib defaults
  11. #plt.style.use('seaborn-whitegrid') # optional: set the overall plot style
  12. #sns.set_style("white") # optional: set the overall background style
  1. categories = np.unique(midwest['category']) # distinct category labels; one scatter call is issued per label
  2. colors = [plt.cm.tab10(i/float(len(categories) - 1)) for i in range(len(categories))] # one tab10 colour per category
  3. fig = plt.figure(figsize = (14, 8), dpi = 120, facecolor = 'w', edgecolor = 'k')
  4. # Draw the categories one at a time
  5. # As with colour, marker sizes correspond one-to-one with the plotted (x, y) points,
  6. # so supplying one size value per point attaches an extra dimension of information to every point
  7. for i, category in enumerate(categories):
  8. plt.scatter('area', 'poptotal'
  9. , data = midwest.loc[midwest.category == category, :] # rows of the current category only
  10. , c = np.array(colors[i]).reshape(1, -1) # the category colour reshaped to a single-row 2-D array
  11. , label = str(category)
  12. , s = midwest.loc[midwest.category == category, 'percasian'] * 500 # marker size driven by the 'percasian' column (share of Asian population), scaled by 500
  13. , alpha = 0.6
  14. , edgecolors = np.array(colors[i]).reshape(1, -1)
  15. , linewidths = 3 # width of the outline drawn around each marker
  16. )
  17. plt.legend()
  1. <matplotlib.legend.Legend at 0x1c77f42b320>

output_6_1.png

  1. midwest.loc[midwest.category == category, 'percasian']
  1. 140 0.238197
  2. 249 0.121292
  3. Name: percasian, dtype: float64
  1. plt.figure(figsize = (16, 10))
  2. for i in range(len(categories)):
  3. plt.scatter(midwest.loc[midwest['category'] == categories[i], 'area'],
  4. midwest.loc[midwest['category'] == categories[i], 'poptotal'],
  5. s = midwest.loc[midwest['category'] == categories[i], 'percasian'] * 500,
  6. color = plt.cm.tab10(i/float(len(categories) + 1)),
  7. label = categories[i])
  8. plt.legend()
  1. <matplotlib.legend.Legend at 0x1c77f7c3e80>

output_8_1.png

  1. plt.figure(figsize = (16, 10))
  2. for i in range(len(categories)):
  3. plt.scatter(midwest.loc[midwest['category'] == categories[i], 'area'],
  4. midwest.loc[midwest['category'] == categories[i], 'poptotal'],
  5. s = midwest.loc[:, 'percasian'] * 500, # deliberately selects ALL rows; in the recorded run this did not raise, i.e. s accepted a sequence whose length differs from x/y. NOTE(review): newer matplotlib releases are expected to raise ValueError here - confirm against the installed version
  6. color = plt.cm.tab10(i/float(len(categories) + 1)),
  7. label = categories[i])
  8. plt.legend()
  1. <matplotlib.legend.Legend at 0x1c77f84bac8>

output_9_1.png

  1. print(midwest.loc[midwest['category'] == categories[0], 'area'].shape)
  2. print(midwest.loc[midwest['category'] == categories[0], 'poptotal'].shape)
  3. print(midwest.loc[midwest['category'] == categories[0], 'percasian'].shape)
  4. print(midwest.loc[:, 'percasian'].shape)
  1. (186,)
  2. (186,)
  3. (186,)
  4. (332,)

参数s中可能出现的陷阱

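  1. 如果输入了比原始数据更长的序列,参数只会截取到和原始数据一样长的对应的尺寸
  2. 如果输入了比原始数据更短的序列,参数会循环重复使用已给出的尺寸来自动补全
  3. 注意:以上是本文运行环境(较旧版本的matplotlib)下的表现;较新版本的matplotlib在s的长度与坐标点个数不一致时会直接抛出ValueError,请以所安装版本的实际行为为准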

相比之下,c参数更不容易出错,遇到这种情况会报错

  1. X = np.arange(0, 10, 1)
  2. Y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
  3. size1 = [50, 50, 50, 50, 50, 100, 100, 100, 100, 100]
  4. size2 = [50, 50, 50, 50, 50, 100, 100, 100, 100, 100, 200, 200, 200]
  5. size3 = [50, 50, 50, 50, 50, 100, 100, 200, 200, 200, 100, 100, 100]
  6. size4 = [50, 100, 200]
  1. plt.figure(figsize = (20, 4)) # one wide canvas holding five side-by-side subplots
  2. plt.subplot(1, 5, 1)
  3. plt.scatter(X, X) # baseline: default size and colour
  4. plt.subplot(1, 5, 2)
  5. plt.scatter(X, X,
  6. s = size1, # 10 sizes for 10 points: lengths match
  7. c = Y)
  8. plt.subplot(1, 5, 3)
  9. plt.scatter(X, X,
  10. s = size2, # 13 sizes for 10 points: longer than the data
  11. c = Y)
  12. plt.subplot(1, 5, 4)
  13. plt.scatter(X, X,
  14. s = size3, # longer than the data: in the recorded run only the leading entries matching the number of x/y points were used. NOTE(review): newer matplotlib releases are expected to raise ValueError - confirm
  15. c = Y)
  16. plt.subplot(1, 5, 5)
  17. plt.scatter(X, X,
  18. s = size4, # shorter than the data: in the recorded run the missing sizes were filled in automatically. NOTE(review): newer matplotlib releases are expected to raise ValueError - confirm
  19. c = Y)
  1. <matplotlib.collections.PathCollection at 0x1c77ff227f0>

output_13_1.png

图例

气泡图的图例如何统一大小?

气泡图的图例是用来表示数据的大小等信息,而不是用来表示不同的颜色/分类

别人家的气泡图1111.png别人家的气泡图2222.png

控制图例大小

  1. # Prepare the list of category labels
  2. categories = np.unique(midwest['category'])
  3. colors = [plt.cm.tab10(i/float(len(categories)-1)) for i in range(len(categories))] # NOTE(review): not used below; the scatter call computes its colour inline
  4. # Set up the canvas
  5. plt.figure(figsize = (10, 8))
  6. for i in range(len(categories)):
  7. plt.scatter(midwest.loc[midwest['category'] == categories[i], 'area'],
  8. midwest.loc[midwest['category'] == categories[i], 'poptotal'],
  9. s = midwest.loc[midwest['category'] == categories[i], 'percasian'] * 500,
  10. color = plt.cm.tab10(i/float(len(categories) + 1)),
  11. label = categories[i])
  12. plt.legend() # superseded by the styled plt.legend(...) call at the end of this cell
  13. # Decorate the figure
  14. plt.gca().set(xlim = (0.0, 0.12), ylim = (0, 90000),
  15. xlabel = 'Area', ylabel = 'Population')
  16. plt.xticks(fontsize = 12)
  17. plt.yticks(fontsize = 12)
  18. plt.title("Bubble Plot with Encircling", fontsize = 22)
  19. plt.legend(fontsize = 12
  20. ,markerscale = 0.05 # legend markers are drawn at this fraction of the size of the plotted bubbles
  21. )
  1. <matplotlib.legend.Legend at 0x1c701477e10>

output_16_1.png

在气泡上显示文字信息

  1. X = np.arange(0, 10, 1) # ten points: 0..9
  2. Y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1] # colour group of each point
  3. size1 = [50, 50, 50, 50, 50, 100, 100, 100, 100, 100] # one marker size per point
  4. plt.figure(figsize = (6, 4))
  5. plt.scatter(X, X,
  6. s = size1,
  7. c = Y
  8. )
  9. plt.text(X[3] + 0.08, X[3] + 0.08, # place the label slightly up and to the right of the fourth point
  10. s = 'ha', # here s is the text string to draw, not a marker size as in plt.scatter
  11. fontdict = {'fontsize':18})
  1. Text(3.08, 3.08, 'ha')

output_18_1.png

  1. #准备标签列表
  2. colors = ['red', 'orange', 'pink']
  3. #布置画布
  4. plt.figure(figsize = (14, 8), dpi = 120, facecolor = 'w', edgecolor = 'k')
  5. # 循环绘图
  6. for i, category in enumerate(['AHR', 'HAU', 'LHU']):
  7. data_ = midwest.loc[midwest['category'] == category, :]
  8. data_.index = range(data_.shape[0])
  9. plt.scatter('area', 'poptotal', data = data_,
  10. s = midwest.loc[midwest['category'] == category, 'poppovertyknown'] * 0.05, #调整尺寸,让散点图成为气泡图
  11. color = colors[i],
  12. label = str(category),
  13. edgecolors = colors[i],
  14. alpha = 0.7,
  15. linewidths = .5)
  16. for i in range(midwest.loc[midwest['category'] == category, :].shape[0]):
  17. plt.text(data_.loc[i, 'area'],
  18. data_.loc[i, 'poptotal'],
  19. s = data_.loc[i, 'category'],
  20. fontdict = {"fontsize" : 8},
  21. horizontalalignment = 'right' # 气泡对字符串的相对位置
  22. )
  23. plt.legend()
  24. #装饰图像
  25. plt.gca().set(xlim = (0.0, 0.1), ylim = (0, 90000),
  26. xlabel = 'Area', ylabel = 'Population')
  27. plt.xticks(fontsize = 12)
  28. plt.yticks(fontsize = 12)
  29. plt.title("Bubble Plot with Encircling", fontsize = 22)
  1. Text(0.5, 1.0, 'Bubble Plot with Encircling')

output_19_1.png

绘制凸包

将属于某一类别的散点框起来,显示这一组点的最大轮廓
凸包 vs 凹包.png
SciPy库:是一个专为Python设计的,专注于数学&工程学的库
SciPy的spatial:SciPy中专门处理空间算法和数据结构的模块

  1. np.random.seed(1) # fix the seed so the sampled points are reproducible
  2. # random.randn() draws from the standard normal distribution (mean 0, standard deviation 1)
  3. # random.normal() draws from a normal distribution with the given loc and scale
  4. x1, y1 = np.random.normal(loc = 5 # mean: where the distribution is centred
  5. , scale = 2 # standard deviation (not the variance): controls the spread
  6. , size = (2, 15) # output shape: 2 rows x 15 columns; row 0 unpacks into x1, row 1 into y1
  7. )
  1. print(np.random.normal(loc = 5, scale = 2, size = (2, 15)).shape)
  2. print(np.random.randn(2, 15).shape)
  1. (2, 15)
  2. (2, 15)
  1. x1, y1
  1. (array([8.24869073, 3.77648717, 3.9436565 , 2.85406276, 6.73081526,
  2. 0.39692261, 8.48962353, 3.4775862 , 5.63807819, 4.50125925,
  3. 7.92421587, 0.87971858, 4.35516559, 4.23189129, 7.26753888]),
  4. array([2.80021747, 4.65514358, 3.24428316, 5.08442749, 6.16563043,
  5. 2.79876165, 7.28944742, 6.80318144, 6.00498868, 6.8017119 ,
  6. 3.63254428, 4.75421955, 3.12846113, 4.46422384, 6.06071093]))
  1. x2, y2 = np.random.normal(loc = 8 # mean: where the distribution is centred
  2. , scale = 2.5 # standard deviation (not the variance): controls the spread
  3. , size = (2, 13) # output shape: 2 rows x 13 columns; row 0 unpacks into x2, row 1 into y2
  4. )
  1. plt.figure(figsize = (5, 3))
  2. plt.scatter(x1, y1)
  3. plt.scatter(x2, y2)
  1. <matplotlib.collections.PathCollection at 0x1c703b10b70>

output_25_1.png

scipy.spatial.ConvexHull()

ConvexHull直译是凸包,表示在一个平面上,我们能找到的最小的将一组数据全部包括在内的凸集
通俗地来说,凸包就是包围一组散点的最小凸多边形,相对的也有凹多边形(凹包)

ConvexHull能够帮助我们创建N维凸包

重要参数

points:浮点数组成的二维数组(ndarray),结构为(点的个数,维度)。表示用来构成凸包的坐标点。
incremental:布尔值,可不填。允许不断向类中添加新的数据点。

重要属性

vertices:组成凸包的那些数据点在原数据中的索引(对于二维凸包,这些顶点按逆时针顺序排列,因此可以直接依次连接成多边形)

更多参数和属性:https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.ConvexHull.html

  1. from scipy.spatial import ConvexHull
  2. from matplotlib import patches # 给现有图像打补丁的包,即在现有的图像上增加更多的图形
  1. plt.figure(figsize = (5, 3))
  2. plt.scatter(x1, y1)
  3. plt.scatter(x2, y2)
  4. # 定义绘制凸包的函数
  5. def encircle(x, y, ax = None, **kw): # ax:子图
  6. ax = plt.gca() # get current ax
  7. p = np.c_[x, y] # 类似于zip函数,将两组数据组合起来。不同的是,zip将两两打包成元组,c_将两两打包成array
  8. hull = ConvexHull(p) # 将坐标转换成凸包对象。此对象不可打开,只能绘制在图像上,或通过vertices属性显示坐标在原数据中的索引
  9. poly = plt.Polygon(p[hull.vertices, :], **kw)
  10. # 使用属性vertices调用形成凸包的点的索引,进行切片后,利用绘制多边形的类plt.Polygon将形成凸包的点连起来
  11. # 这里的**kw就是定义函数的时候输入的**kw,里面包含了一系列可以在绘制多边形的类中进行调节的内容,包括多边形的边框颜色,填充颜色,透明度等等
  12. ax.add_patch(poly)
  13. encircle(x1, y1
  14. , ec = 'k'
  15. , fc = 'gold'
  16. , alpha = 0.2
  17. )
  18. encircle(x2, y2, ec="orange", fc="none")

output_28_0.png

凸包的作用

在计算机视觉技术当中,我们经常需要利用凸包,以帮助计算机识别图像的轮廓
凸.png

  1. # Prepare the list of category labels
  2. categories = np.unique(midwest['category'])
  3. colors = [plt.cm.tab10(i/float(len(categories)-1)) for i in range(len(categories))] # NOTE(review): not used below; the scatter call computes its colour inline
  4. plt.figure(figsize = (16, 10))
  5. for i in range(len(categories)):
  6. plt.scatter(midwest.loc[midwest['category'] == categories[i], 'area'],
  7. midwest.loc[midwest['category'] == categories[i], 'poptotal'],
  8. s = midwest.loc[midwest['category'] == categories[i], 'percasian'] * 500,
  9. color = plt.cm.tab10(i/float(len(categories) + 1)),
  10. label = categories[i])
  11. plt.legend(markerscale = 0.6)
  12. # Select the data to be encircled: every row (county) whose state is 'IN'
  13. midwest_encircle_data = midwest.loc[midwest['state'] == 'IN', :]
  14. # Draw the hull with encircle(), in two passes
  15. # Pass 1: a translucent gold fill
  16. encircle(midwest_encircle_data.area,
  17. midwest_encircle_data.poptotal,
  18. ec = 'k',
  19. fc = 'gold',
  20. alpha = 0.1)
  21. # Pass 2: an opaque light-blue outline with no fill
  22. encircle(midwest_encircle_data.area,
  23. midwest_encircle_data.poptotal,
  24. ec = 'lightblue',
  25. fc = 'none',
  26. linewidth = 1.5)

output_30_0.png

对比凸包在计算机视觉中的应用,在该气泡图中的应用可获取的信息十分有限

  1. IN州各县的占地面积都不大
  2. 人口则各种程度的都有
  3. 由于覆盖的范围过大,不够集中,无法获得有针对性的,有价值的信息