Numpy

1. 逻辑操作

  1. 与 np.logical_and()
  2. 或 np.logical_or()
  3. 异或 np.logical_xor()
  4. 非 np.logical_not()

```python
# Create arrays
import numpy as np
my_house = np.array([18.0, 20.0, 10.75, 9.50])
your_house = np.array([14.0, 24.0, 14.25, 9.0])

# my_house greater than 18.5 or smaller than 10
print(np.logical_or(my_house > 18.5, my_house < 10))

# Both my_house and your_house smaller than 11
print(np.logical_and(my_house < 11, your_house < 11))
```

<a name="lc5wL"></a>
# Pandas
<a name="2bMAY"></a>
## 1. import file
```python
import pandas as pd
data_df = pd.read_csv('data.csv', index_col=0) # 第一列作为索引列
```

2. DataFrame and Series

DataFrame,可以存储不同数据类型
Series 类似于 ndarray,一列中只能存在一种数据类型

```python
# Import cars data
import pandas as pd
cars = pd.read_csv('cars.csv', index_col = 0)
# Print out country column as Pandas Series
print(cars['country']) # output Series
# Print out country column as Pandas DataFrame
print(cars[['country']]) # output DataFrame
# Print out DataFrame with country and drives_right columns
print(cars[['country','drives_right']])
```

3. loc and iloc

  • loc 根据索引内容查找
  • iloc 根据 row 和 col 的位置进行索引

```python
# 行索引查找
# Print out observation for Japan
print(cars.loc[['JPN']]) # 根据index内容索引
print(cars.iloc[[2]]) # 根据index位置索引

# Print out observations for Australia and Egypt
print(cars.loc[['AUS','EG']])
print(cars.iloc[[1,6]])

# 行列索引查找
# Import cars data
import pandas as pd
cars = pd.read_csv('cars.csv', index_col = 0)

# Print out drives_right value of Morocco
print(cars.loc['MOR','drives_right'])

# Print sub-DataFrame
print(cars.loc[['RU','MOR'],['country','drives_right']])
```

<a name="nvvXs"></a>
## 4. DataFrame数据筛选
```python
# Import cars data
import pandas as pd
cars = pd.read_csv('cars.csv', index_col = 0)
# Convert code to a one-liner
sel = cars[cars['drives_right']] # cars['drives_right'] 是值为 True/False 的 Series,用作筛选条件;sel 是筛选后的 DataFrame
# Print sel
print(sel)
```

遍历字典、NumPy 数组和 DataFrame

```python
# Import numpy as np
import numpy as np
# Definition of dictionary
europe = {'spain':'madrid', 'france':'paris', 'germany':'berlin',
          'norway':'oslo', 'italy':'rome', 'poland':'warsaw', 'austria':'vienna' }
# Iterate over europe
for key, value in europe.items():
    print("the capital of " + key + " is " + value)
# For loop over np_baseball
for i in np.nditer(np_baseball): # np.nditer 逐个元素迭代多维数组
    print(i)
# Iterate over rows of cars
for lab, row in cars.iterrows(): # 遍历行数据
    print(lab) # 索引
    print(row) # 行数据
```

5. apply()应用

```python
# Import cars data
import pandas as pd
cars = pd.read_csv('cars.csv', index_col = 0)
# Use .apply(str.upper)
cars['COUNTRY'] = cars['country'].apply(str.upper) # 直接调用函数
print(cars)
```

6. Random

生成随机数

  1. np.random.seed()
  2. np.random.rand()
  3. np.random.randint()

```python
# Import numpy as np
import numpy as np

# Set the seed
np.random.seed(123) # 设置种子参数,每次生成的随机数是一样的

# Generate and print random float
np.random.rand() # 不设置参数,默认生成float

# Use randint() to simulate a dice
print(np.random.randint(1,7)) # 生成整数,必须设置参数;范围为 [1, 7),不包含 7
```

<a name="nvkDV"></a>
# Seaborn
底层 matplotlib,高层 seaborn
```python
# Import Matplotlib and Seaborn
import matplotlib.pyplot as plt
import seaborn as sns
# Create scatter plot with GDP on the x-axis and number of phones on the y-axis
sns.scatterplot(x=gdp, y=phones) # 散点图
# Create count plot with region on the y-axis
sns.countplot(y=region) # 柱状图
# Show plot
plt.show()
```

1. sns.scatterplot()

  • x轴、y轴
  • hue 标签名
  • hue_order 标签顺序
  • palette 标签颜色

```python
# Import Matplotlib and Seaborn
import matplotlib.pyplot as plt
import seaborn as sns

# Create a dictionary mapping subgroup values to colors
palette_colors = {"Rural": "green", "Urban": "blue"}

# Change the legend order in the scatter plot 散点图
sns.scatterplot(x="absences", y="G3",
                data=student_data,
                hue="location",
                hue_order=['Rural','Urban'],
                palette=palette_colors)

# Show plot
plt.show()
```

<a name="PfIRx"></a>
## 2. sns.relplot() 任意类型
<a name="qfNXT"></a>
### 散点图
```python
# Import Matplotlib and Seaborn
import matplotlib.pyplot as plt
import seaborn as sns
# 散点图
# Adjust further to add subplots based on family support
sns.relplot(x="G1", y="G3",
            data=student_data,
            kind="scatter", # 散点图类型
            col="schoolsup", # 将列划分
            col_order=["yes", "no"], # 列的顺序
            row='famsup', # 将行划分
            row_order=['yes','no'])
# Show plot
plt.show()
# Create scatter plot of horsepower vs. mpg
sns.relplot(x="horsepower", y="mpg",
            data=mpg,
            kind="scatter",
            size="cylinders", # 点状大小区分
            style='origin', # 点的标记样式区分
            hue='cylinders') # 颜色区分
# Show plot
plt.show()
```

折线图

```python
# 折线图
# Add markers and make each line have the same style
sns.relplot(x="model_year", y="horsepower",
            data=mpg,
            kind="line",
            ci='sd', # confidence interval 置信区间:可取整数(置信水平百分比)、'sd'(标准差)或 None(不显示)
            style="origin",
            hue="origin",
            markers=True) # 显示标记点
# Show plot
plt.show()
```

柱状图 countplot() and catplot()

```python
# Create a bar plot of interest in math, separated by gender
sns.catplot(x="study_time", y="G3",
            data=student_data,
            kind="bar", # 条形图
            order=["<2 hours",
                   "2 to 5 hours",
                   "5 to 10 hours",
                   ">10 hours"], # 显示顺序
            ci=None) # 取消置信区间
# Show plot
plt.show()
```

箱型图 box plot

```python
# Create a box plot with subgroups and omit the outliers
sns.catplot(x='internet',y='G3',
            kind='box',
            data=student_data,
            hue='location',
            sym='', # 不显示离群点标记
            whis=[0, 100]) # 须的范围:[0, 100] 表示延伸到第 0 和第 100 百分位(即最小值和最大值);若为单个数值则表示 IQR 的倍数
# Show plot
plt.show()
```

点状图 point plot()

区别于lineplot(),对非连续的 category 进行统计

```python
# Remove the lines joining the points
sns.catplot(x="famrel", y="absences",
            data=student_data,
            kind="point",
            capsize=0.2, # 置信区间误差线两端横线(cap)的宽度
            join=False) # 去除连接线
# Show plot
plt.show()
```

3. 自定义样式

```python
# Set the style to "whitegrid"
sns.set_style('whitegrid') # 图表背景
sns.set_palette("Purples") # 柱状图颜色
# sns.set_palette("RdBu")
# sns.set_context("paper") 最小
# sns.set_context("notebook") 其次(默认);再大一级为 "talk"
sns.set_context("poster") # 设置文本大小,最大
# Create a count plot of survey responses
category_order = ["Never", "Rarely", "Sometimes",
                  "Often", "Always"]
g = sns.catplot(x="Parents Advice",
                data=survey_data,
                kind="count",
                order=category_order)
# title(g 为 catplot()/relplot() 返回的 FacetGrid 时使用)
g.fig.suptitle('Car Weight vs. Horsepower')
# 线型图的标题添加(g 为 sns.lineplot() 等返回的 AxesSubplot 时使用)
# Add a title "Average MPG Over Time"
g.set_title("Average MPG Over Time")
# Add x-axis and y-axis labels
g.set(xlabel='Car Model Year',ylabel='Average MPG')
# 旋转图形
# Rotate x-tick labels
plt.xticks(rotation=90)
# Show plot
plt.show()
```

image.svg