创建虚拟数据

faker库
image.png

  1. import pandas as pd
  2. from faker import Faker
  3. import random as rd
  4. fake = Faker("zh-CN")
  5. # 要输入到DF中的数据字典
  6. dic = {}
  7. # 姓名
  8. list_name = []
  9. # 随机手机号
  10. list_phone_number = []
  11. # 随机国家
  12. list_country = []
  13. # 随机邮件
  14. list_email = []
  15. # 随机车牌号
  16. list_car_number = []
  17. # 随机年份,格式2022-02-22
  18. list_date = []
  19. # 随机字符串,长度在min-max之间
  20. list_str =[]
  21. # 随机年龄
  22. list_age = []
  23. # 随机金额
  24. list_money = []
  25. # 索引
  26. list_index = []
  27. for i in range(0,1000):
  28. list_index.append(i)
  29. list_name.append(fake.name())
  30. list_phone_number.append(fake.phone_number())
  31. list_country.append(fake.country())
  32. list_email.append(fake.email())
  33. list_car_number.append(fake.license_plate())
  34. list_date.append(fake.date_time(tzinfo=None, end_datetime=None))
  35. list_str.append(fake.pystr(min_chars=0, max_chars=8))
  36. list_age.append(rd.randint(15,65))
  37. list_money.append(rd.random()*9999)
  38. dic['Index'] = list_index
  39. dic['Name'] = list_name
  40. dic['Age'] = list_age
  41. dic['Phone'] = list_phone_number
  42. dic['Country'] = list_country
  43. dic['Email'] = list_email
  44. dic['CarNumber'] = list_car_number
  45. dic['Date'] = list_date
  46. dic['Note'] = list_str
  47. dic['Money'] = list_money
  48. df = pd.DataFrame(dic)
  49. print(df.columns)
  50. df.to_excel('./TestExcel/AlbertData.xlsx')

创建文件

  1. import pandas as pd
  2. # DataFrame-->Worksheet
  3. # 初始化数据
  4. df = pd.DataFrame({'ID': [1, 2, 3, 4, 5, 6], 'Name': ['Tim', 'Victor', 'Nick', 'Timthony', 'Yangzhongke', 'Zack']})
  5. # 设置ID为索引,而不是默认的ID 0 1 2
  6. df = df.set_index('ID')
  7. print(df.columns)
  8. print(df)
  9. excelPath = './TestExcel/output.xlsx' # 使用相对路径
  10. df.to_excel(excelPath)
  11. print('Create excel successfully!')

读取文件

  1. import pandas as pd
  2. excelPath = './TestExcel/output.xlsx' # 使用相对路径
  3. outExcel = pd.read_excel(excelPath, index_col='ID')
  4. # outExcel = pd.read_excel(excelPath, header=1) # 从第1行开始作为标题
  5. # 如果excel里面没有header,但是我们知道如何操作?下面两行
  6. # 操作将Name列显示为了SubName
  7. outExcel = pd.read_excel(excelPath, header=None)
  8. outExcel.columns = ['ID', 'SubName']
  9. # 将修改好的数据SubName写到新的文件中
  10. outExcel.set_index('ID', inplace=True)
  11. outExcel.to_excel('./TestExcel/output002.xlsx')
  12. print('Done')
  13. print(outExcel.shape) # 显示出这个表有几行几列
  14. print(outExcel.columns) # 显示出所有列名称
  15. print(f'我是文件头两行数据:\n{outExcel.head(2)}') # 查看前2行
  16. print(f'我是文件尾三行数据:\n{outExcel.tail(3)}') # 后三行

行,列,单元格

  1. import pandas as pd
  2. # dic = {'x': 100, 'y': 200, 'z': 300}
  3. # s1 = pd.Series(dic)
  4. # 下面这种方式是一样的效果
  5. # index是行号 name是列号 [100,200,300]是行内容
  6. # s1 = pd.Series([100, 200, 300], index=['x', 'y', 'z'])
  7. s1 = pd.Series([1, 2, 3], index=[1, 2, 3], name='A') # 一行数据
  8. s2 = pd.Series([10, 20, 30], index=[1, 2, 3], name='B')
  9. s3 = pd.Series([100, 200, 300], index=[2, 3, 4], name='C')
  10. # 以Dictionary的形式将Series加入到DataFrame中
  11. df = pd.DataFrame({s1.name: s1, s2.name: s2, s3.name: s3})
  12. # df = df.set_index('A')
  13. # 假如Series index是[2,3,4] 没有的则会填充NaN
  14. print(df.head())

数据区域的读取,填充数字,文字序列

  1. import pandas as pd
  2. from datetime import date, timedelta
  3. # 实现月份累加算法,d是初始时间,md是累加的月份
  4. def add_month(d, md):
  5. yd = md // 12
  6. m = d.month + md % 12
  7. if m != 12:
  8. yd += m // 12
  9. m = m % 12
  10. return date(d.year + yd, m, d.day)
  11. # 跳过空行16行,选取E:H列 指定ID列的数据类型为int(先转为string类型,直接转int类型不行)
  12. books = pd.read_excel('./TestExcel/output004.xlsx', skiprows=16, usecols="E:H",
  13. dtype={'ID': str, 'InStore': str, 'Date': str})
  14. startTime = date(2021, 12, 25)
  15. for i in books.index:
  16. books['ID'].at[i] = i + 1
  17. print(type(books['ID'])) # 此种写法books['ID']是pandas.core.series.Series对象
  18. books['InStore'].at[i] = 'Yes' if i % 2 == 0 else 'NO' # 在Python中if else可以组成表达式用
  19. # books['Date'].at[i] = startTime + timedelta(days=i) # 天数依次累加
  20. # books['Date'].at[i] = date(startTime.year+i,startTime.month,startTime.day) # 年份依次累加
  21. books['Date'].at[i] = add_month(startTime, i) # 月份依次累加
  22. # print(books['ID'])
  23. books.to_excel('./TestExcel/output004Test.xlsx')
  24. print(books)

填充日期序列

  1. import pandas as pd
  2. from datetime import date, timedelta
  3. # 实现月份累加算法,d是初始时间,md是累加的月份
  4. def add_month(d, md):
  5. yd = md // 12
  6. m = d.month + md % 12
  7. if m != 12:
  8. yd += m // 12
  9. m = m % 12
  10. return date(d.year + yd, m, d.day)
  11. # 跳过空行16行,选取E:H列 指定ID列的数据类型为int(先转为string类型,直接转int类型不行)
  12. books = pd.read_excel('./TestExcel/output004.xlsx', skiprows=16, usecols="E:H",
  13. dtype={'ID': str, 'InStore': str, 'Date': str})
  14. startTime = date(2021, 12, 25)
  15. for i in books.index:
  16. books['ID'].at[i] = i + 1
  17. print(type(books['ID'])) # 此种写法books['ID']是pandas.core.series.Series对象
  18. books['InStore'].at[i] = 'Yes' if i % 2 == 0 else 'NO' # 在Python中if else可以组成表达式用
  19. # books['Date'].at[i] = startTime + timedelta(days=i) # 天数依次累加
  20. # books['Date'].at[i] = date(startTime.year+i,startTime.month,startTime.day) # 年份依次累加
  21. books['Date'].at[i] = add_month(startTime, i) # 月份依次累加
  22. # print(books['ID'])
  23. books.to_excel('./TestExcel/output004Test.xlsx')
  24. print(books)

函数填充,计算列

  1. import pandas as pd
  2. def add_2(x):
  3. return x + 2
  4. books = pd.read_excel('./TestExcel/output005.xlsx', dtype={'ID': str, 'Name': str},index_col='ID')
  5. for i in books.index:
  6. books['FirstPrice'].at[i] = 20 + i;
  7. books['LastPrice'].at[i] = 30 + i;
  8. # 列计算无需取出每行,直接列运算即可
  9. books['FinalPrice'] = books['FirstPrice'] * books['LastPrice']
  10. # 自己列加2块钱
  11. books['FinalPrice'] += 2
  12. # 另一种方式追加2块钱,函数
  13. books['FinalPrice'] = books['FinalPrice'].apply(add_2)
  14. # 另一种方式追加2块钱,lambda表达式
  15. books['FinalPrice'] = books['FinalPrice'].apply(lambda x: x + 2)
  16. print(books)
  17. books.to_excel('./TestExcel/output005.xlsx')

排序

  1. import pandas as pd
  2. books = pd.read_excel('./TestExcel/output006.xlsx', dtype={'ID': str, 'Name': str}, index_col='ID')
  3. # inplace=True是为了不生产新的DF
  4. # 按照价格排序 ascending默认为True升序排列
  5. # books.sort_values(by='FinalPrice', inplace=True, ascending=False)
  6. # 先按照Worthly排序,再按照FinalPrice排序
  7. books.sort_values(by=['Worthly', 'FinalPrice'], inplace=True, ascending=[True,False])
  8. print(books)

数据筛选、过滤

  1. import pandas as pd
  2. def lastPrice_32_to_42(a):
  3. return a >= 32 and a < 42
  4. def finalPrice_657_to_2000(a):
  5. return 657 <= a < 2000
  6. books = pd.read_excel('./TestExcel/output007.xlsx', index_col='ID')
  7. # 这边loc定位到某列,按照apply里面的函数关系来进行筛选
  8. # books = books.loc[books['LastPrice'].apply(lastPrice_32_to_42)].loc[books['FinalPrice'].apply(finalPrice_657_to_2000)]
  9. # 可以直接.列名来修改
  10. books = books.loc[books.LastPrice.apply(lastPrice_32_to_42)].loc[books.FinalPrice.apply(finalPrice_657_to_2000)]
  11. books.sort_values(by=['Worthly', 'FinalPrice'], inplace=True, ascending=[True, False])
  12. print(books)

柱状图

Figure_1.png

  1. import pandas as pd
  2. import matplotlib.pyplot as plt
  3. # 绘图,不要添加index_col='Field'索引列
  4. books = pd.read_excel('./TestExcel/output008.xlsx')
  5. print(books.head(5))
  6. # 按照Number列排序 降序 替换原先的DF
  7. books.sort_values(by='Number', ascending=False, inplace=True)
  8. # 使用pandas绘图:横坐标为Field 纵坐标为Number 颜色橘色 标题xxx
  9. # books.plot.bar(x='Field', y='Number',color='orange',title='InterNationalBooks')
  10. # 使用plt绘图
  11. plt.bar(books.Field, books.Number, color='orange')
  12. # 旋转标签90
  13. plt.xticks(books.Field, rotation=90)
  14. plt.xlabel('Field')
  15. plt.ylabel('Number')
  16. plt.title('InternationalBooks', fontsize=16)
  17. # 紧凑型绘图,能够将横坐标全部显示出来
  18. plt.tight_layout()
  19. plt.show()

分组柱图,深度优化

Figure_2.png

  1. import pandas as pd
  2. import matplotlib.pyplot as plt
  3. books = pd.read_excel('./TestExcel/output009.xlsx')
  4. print(books)
  5. books.sort_values(by=2021, inplace=True, ascending=False)
  6. books.plot.bar(x='Field', y=[2020, 2021],color=['orange','blue'])
  7. plt.title('National Books', fontsize=16, fontweight='bold')
  8. plt.xlabel('Field', fontweight='bold')
  9. plt.ylabel('Number', fontweight='bold')
  10. # 自定义X轴样式:斜45° 文字最末端对齐
  11. ax = plt.gca()
  12. ax.set_xticklabels(books.Field, rotation=45, ha='right')
  13. # 将图片左边和底部固定
  14. figures = plt.gcf()
  15. figures.subplots_adjust(left=0.2,bottom=0.42)
  16. # plt.tight_layout()
  17. plt.show()

叠加柱状图,水平柱状图

image.png

  1. import pandas as pd
  2. import matplotlib.pyplot as plt
  3. users = pd.read_excel('./TestExcel/output010.xlsx')
  4. users['Total'] = users.Oct+users.Nov+users.Dec
  5. users.sort_values(by='Total',ascending=False,inplace=True)
  6. print(users)
  7. # 分组柱状图-->叠加柱状图stacked
  8. users.plot.bar(x='Name',y=['Oct','Nov','Dec'],stacked=True)
  9. plt.title('User Behavior',fontsize=24,fontweight='bold')
  10. # 旋转X轴文字45°
  11. ax = plt.gca()
  12. ax.set_xticklabels(users.Name,rotation=45,ha='right')
  13. plt.tight_layout()
  14. # 水平柱状图
  15. users.sort_values(by='Total',ascending=True,inplace=True)
  16. users.plot.barh(x='Name',y=['Oct','Nov','Dec'],stacked=True)
  17. plt.tight_layout()
  18. plt.show()

饼图

image.png

  1. import pandas as pd
  2. import matplotlib.pyplot as plt
  3. students = pd.read_excel('./TestExcel/output011.xlsx',index_col='From')
  4. print(students)
  5. # 绘制饼图,默认情况下以index_col作为图例显示饼图
  6. # 排序以顺时针形式呈现,并将开始位置设置为-250
  7. # students['2017'].sort_values().plot.pie(startangle=-250)
  8. # 第二种方式counterclock=False顺时针方式呈现
  9. students['2017'].plot.pie(counterclock=False)
  10. plt.title('Source of International Students')
  11. plt.ylabel('2017',fontweight='bold')
  12. plt.show()

折线趋势图,叠加区域图

image.png

  1. import pandas as pd
  2. import matplotlib.pyplot as plt
  3. weeks = pd.read_excel('./TestExcel/output012.xlsx',index_col='Week')
  4. print(weeks)
  5. print(weeks.columns)
  6. # 折线图
  7. weeks.plot(y=['Accessories', 'Bikes', 'Clothing', 'Components', 'Grand Total'])
  8. plt.title('Sales Weekly Trend',fontsize=16,fontweight='bold')
  9. plt.ylabel('Total',fontsize=12,fontweight='bold')
  10. plt.xticks(weeks.index)
  11. # 叠加区域图
  12. weeks.plot.area(y=['Accessories', 'Bikes', 'Clothing', 'Components', 'Grand Total'])
  13. plt.title('Sales Weekly Trend',fontsize=16,fontweight='bold')
  14. plt.ylabel('Total',fontsize=12,fontweight='bold')
  15. plt.xticks(weeks.index)
  16. plt.show()

散点图、直方图、密度图、数据相关性

  1. import pandas as pd
  2. import matplotlib.pyplot as plt
  3. # 显示所有列
  4. pd.options.display.max_columns =777
  5. homes = pd.read_excel('./TestExcel/output013.xlsx')
  6. # print(homes.head())
  7. # 散点图scatter
  8. #homes.plot.scatter(x='sqft_living',y='price')
  9. # 直方图hist 分布区间bins
  10. # homes.sqft_living.plot.hist(bins=100)
  11. # plt.xticks(range(0,max(homes.sqft_living),5000),fontsize=8,rotation=90)
  12. # 密度图kde
  13. homes.sqft_living.plot.kde()
  14. plt.xticks(range(0,max(homes.sqft_living),5000),fontsize=8,rotation=90)
  15. plt.show()
  16. # 数据相关性分析 column row relation
  17. print(homes.corr())

多表联合(从VLOOKUP到JOIN)

连接了两张表,一张学生姓名表,一张分数表
image.pngimage.png
image.png

  1. from typing import SupportsRound
  2. import pandas as pd
  3. students = pd.read_excel('./TestExcel/output014.xlsx',sheet_name='Students',index_col='ID')
  4. scores = pd.read_excel('./TestExcel/output014.xlsx',sheet_name='Scores',index_col='ID')
  5. # students表join scores表 how表示不论是否能查询到数据始终保持students表的数据 以ID列来join
  6. # fillna 用0来填充空值
  7. # 如果左右两边ID列名字不一样 left_on right_on
  8. # table = students.merge(scores,how='left',left_on=students.index, right_on=scores.index).fillna(0)
  9. table = students.join(scores,how='left').fillna(0)
  10. # 将浮点数转为原始的int数据
  11. table.Score = table.Score.astype(int)
  12. print(table)

数据校验、轴的概念

  1. import pandas as pd
  2. def validate_score(x):
  3. """[校验函数]
  4. Args:
  5. x ([object]): [学生信息]
  6. """
  7. try:
  8. assert 0<=x.Score<=100
  9. except:
  10. print(f'#{x.ID}\tStudent {x.Name} has an invalid score {x.Score}')
  11. #第二种写法
  12. #if not 0<=x.Score<=100:
  13. #print(f'#{x.ID}\tStudent {x.Name} has an invalid score {x.Score}')
  14. students = pd.read_excel('./TestExcel/output015.xlsx')
  15. # axis默认为0,从上到下 1,从左到右进行校验
  16. students.apply(validate_score,axis=1)
  17. # print(students)

把一列数据分割成两列

  1. import pandas as pd
  2. Students = pd.read_excel('./TestExcel/output015.xlsx')
  3. # 一列数据分割成两列expand=True n=0切出来多少子字符串就保留多少
  4. df = Students.Name.str.split('_',expand=True)
  5. Students['StudentTitle'] = df[0].str.upper()
  6. Students['StudentIndex'] = df[1]
  7. print(Students)

求和、求平均、统计引导

image.png

  1. import pandas as pd
  2. house_price = pd.read_excel('./TestExcel/output013.xlsx')
  3. # DataFrame.sum()
  4. # axis 1横轴 0纵轴 sum求和 mean求平均
  5. temp_total = house_price[['bedrooms','bathrooms']].sum(axis=1)
  6. temp_average = house_price[['bedrooms','bathrooms']].mean(axis=1)
  7. house_price['TotalRooms'] = temp_total
  8. house_price['AverageRooms'] = temp_average
  9. # house_price['AverageRooms'] .astype(int)
  10. # 追加列方向的累加
  11. col_sum = house_price[['ID','price','bedrooms','bathrooms','sqft_living','sqft_basement']].sum()
  12. col_sum['TotalRooms'] = 'Summary'
  13. # 将自动按照col_sum的列名来进行追加
  14. house_price = house_price.append(col_sum,ignore_index=True)
  15. print(house_price)

定位、消除重复数据

image.png

  1. from os import dup
  2. import pandas as pd
  3. students = pd.read_excel('./TestExcel/output009.xlsx')
  4. row_sum = students[['Firstyear','Lastyear']].sum(axis=1)
  5. students['TotalSales'] = row_sum
  6. col_sum = students[['Firstyear','Lastyear','TotalSales']].sum()
  7. col_sum['Field'] = 'Summary'
  8. students = students.append(col_sum,ignore_index=True)
  9. print(students)
  10. print('\n')
  11. # 取出那些重复的数据
  12. dupe = students.duplicated(subset='Field')
  13. print(dupe.any())
  14. print(dupe.count())
  15. print(dupe[dupe]) #自己和True比较
  16. dupe = dupe[dupe==True]
  17. print(f'{dupe.index}\n')
  18. print(f'{students.iloc[dupe.index]}\n') #取出那些数据
  19. # 消除重复数据,基于哪些列 subset=['Field',Firstyear'] keep='first'/'last'保留开始还是后面的
  20. students.drop_duplicates(subset='Field',inplace=True,keep='first')
  21. print(students)

旋转数据表(行转换)

image.png

  1. import pandas as pd
  2. # 显示最大列
  3. pd.options.display.max_columns = 999
  4. videos = pd.read_excel('./TestExcel/output016.xlsx',index_col='Month')
  5. # 旋转表格,要将index_col设置为旋转坐标列Month,不然就会以默认index来作为列名
  6. videos = videos.transpose()
  7. print(videos)

读取CVS、TSV、TXT文件中的数据

  1. import pandas as pd
  2. students1 = pd.read_csv('./TestOtherFiles/Students.csv')
  3. print(students1)
  4. students2 = pd.read_csv('./TestOtherFiles/Students.txt',sep='|')
  5. print(students2)

透视表、分组、聚合(GroupBy)

  1. import pandas as pd
  2. import numpy as np
  3. pd.options.display.max_columns = 777
  4. orders = pd.read_excel('./TestExcel/output017.xlsx')
  5. orders['Year'] = pd.DatetimeIndex(orders.Date).year
  6. print(orders)
  7. piovot_table1=orders.pivot_table(index='Category',columns='Year',values='Total',aggfunc=np.sum)
  8. print(piovot_table1)
  9. groups = orders.groupby(['Category','Year'])
  10. s = groups['Total'].sum()
  11. c = groups['ID'].count()
  12. piovot_table2 = pd.DataFrame({'Sum':s,'Count':c})
  13. print(piovot_table2)

线性回归、数据预测

数据分析和机器学习的边界

  1. import pandas as pd
  2. import matplotlib.pyplot as plt
  3. from scipy.stats import linregress
  4. def validate_revenue(series):
  5. if not 5<series.Revenue<20:
  6. print(f'{series}值不在范围内')
  7. sales = pd.read_excel('./TestExcel/output018.xlsx',dtype={'Month':str})
  8. # 斜率,截距
  9. slope,intercept,r,p,std_err = linregress(sales.index,sales.Revenue)
  10. # 线性回归方程
  11. exp = sales.index*slope+intercept
  12. # sales.plot.area(y='Revenue')
  13. # 散点图
  14. plt.scatter(x=sales.index,y=sales['Revenue'],color='orange')
  15. plt.plot(sales.index,exp,color='blue')
  16. #plt.bar(sales.index,sales.Revenue)
  17. ax = plt.gca()
  18. ax.set_xticklabels(sales['Month'],rotation=45,ha='right')
  19. plt.title(f'y={slope}*X+{intercept}')
  20. plt.tight_layout()
  21. plt.show()
  22. # 横向验证,传入函数的是series
  23. #sales.apply(validate_revenue,axis=1)
  24. #print(sales.Month.dtype)

条件格式化

anacoda input: jupyter notebook
image.png

  1. import pandas as pd
  2. def low_score_red(s):
  3. color = 'red' if s<21000 else 'black'
  4. return f'color:{color}'
  5. def highest_score_green(col):
  6. #把字体编程绿色,底层变为黄绿色
  7. return ['background-color:lime;color:green' if s==col.max() else 'background-color:white' for s in col]
  8. students = pd.read_excel('F:/Repo/albertpython/TestExcel/output011.xlsx')
  9. students.style.applymap(low_score_red,subset=['2016','2017'])\
  10. .apply(highest_score_green,subset=['2016','2017'])

image.png

  1. import pandas as pd
  2. import seaborn as sns
  3. color_map = sns.light_palette('green',as_cmap=True)
  4. students = pd.read_excel('F:/Repo/albertpython/TestExcel/output011.xlsx')
  5. # 将颜色深度应用到数据列上
  6. students.style.background_gradient(color_map,subset=['2016','2017'])

image.png

  1. import pandas as pd
  2. #import seaborn as sns
  3. #color_map = sns.light_palette('green',as_cmap=True)
  4. students = pd.read_excel('F:/Repo/albertpython/TestExcel/output011.xlsx')
  5. # 将颜色深度应用到数据列上
  6. # students.style.background_gradient(color_map,subset=['2016','2017'])
  7. students.style.bar(color='orange',subset=['2016','2017'])

行操作集锦

  1. import pandas as pd
  2. students1 = pd.read_excel('F:/Repo/albertpython/TestExcel/output011.xlsx',sheet_name='Sheet1')
  3. students2 = pd.read_excel('F:/Repo/albertpython/TestExcel/output011.xlsx',sheet_name='Sheet2')
  4. # 两个Sheet合并为一个,重置index_col drop=True放弃原来的
  5. student_total = students1.append(students2).reset_index(drop=True)
  6. # 追加手动创建的一行
  7. stu = pd.Series({'Rank':100,'From':'USA','2016':90,'2017':100})
  8. student_total=student_total.append(stu,ignore_index=True)
  9. # 更改其中一行的值
  10. student_total['From'].at[13] = 'Mexico3'
  11. student_total.at[12,'From'] = 'Japan3'
  12. # 插入一行 切片技术
  13. stu = pd.Series({'Rank':101,'From':'USB','2016':90,'2017':100})
  14. # 左闭右开
  15. part1 = student_total[:10]
  16. part2 = student_total[10:]
  17. student_total = part1.append(stu,ignore_index=True).append(part2).reset_index(drop=True)
  18. # 删除数据行
  19. # student_total.drop(index=[0,1,2],inplace=True)
  20. # student_total.drop(index=range(4,6),inplace=True)
  21. for i in range(5,8):
  22. student_total['From'].at[i] =''
  23. missing_data = student_total.loc[student_total['From']=='']
  24. student_total.drop(index=missing_data.index,inplace=True) #切片的方式传入
  25. student_total.reset_index(drop=True) # 重新铺一下index
  26. print(student_total)

列操作集锦

  1. import pandas as pd
  2. import numpy as np
  3. students = pd.read_excel('./TestExcel/output014.xlsx',sheet_name='Students')
  4. scores = pd.read_excel('./TestExcel/output014.xlsx',sheet_name='Scores')
  5. # 列横向拼接
  6. list_total = pd.concat([students,scores],axis=1).reset_index(drop=True)
  7. print(list_total)
  8. # 行追加第二种写法
  9. list_total2 = pd.concat([students,scores],ignore_index=True).reset_index(drop=True)
  10. print(list_total2)
  11. # 将ID相同Join进来
  12. list_total3 = students.merge(scores,how='left',on=['ID','ID'])
  13. # 追加一列:
  14. list_total3['Age'] = 25
  15. list_total3['RealAge'] = np.arange(0,len(list_total3))
  16. print(f'Total3:\n{list_total3}\n')
  17. print(np.arange(0,len(list_total3)))
  18. # 删除列
  19. list_total3.drop(columns=['Age','RealAge'],inplace=True)
  20. print(f'DropTotal3:\n{list_total3}\n')
  21. # 插入列
  22. list_total3.insert(1,column='Foo',value=26)
  23. print(f'DropTotal3:\n{list_total3}\n')
  24. # 列名大写
  25. list_total3.rename(columns={'Foo':'FOO'},inplace=True)
  26. print(f'Upper:\n{list_total3}\n')
  27. # 删除空行
  28. list_total3.dropna(inplace=True)
  29. print(f'Dropnan:\n{list_total3}\n')

读取数据库

image.png

  1. import pandas as pd
  2. import numpy as np
  3. import pyodbc as dbc
  4. # ORM框架 create_engine
  5. import sqlalchemy
  6. books = pd.read_excel('./TestExcel/output019.xlsx')
  7. print(books)
  8. # 此处注意Server={ServerName} ServerName未必是{local}
  9. connection = dbc.connect('Driver={SQL Server};'
  10. 'Server={DESKTOP-07OJ9VD};'
  11. 'Database=AlbertBook;'
  12. 'Trusted_Connection=yes;')
  13. query = 'Select Title from T_Books'
  14. df1 = pd.read_sql_query(query,connection)
  15. print(df1)

编写复杂方程

image.png

  1. import pandas as pd
  2. import numpy as np
  3. def cal_bike_complex_clothing(b,c):
  4. return b**2+c*np.pi
  5. sales = pd.read_excel('./TestExcel/output012.xlsx')
  6. sales['complex'] = sales.apply(lambda row:cal_bike_complex_clothing(row['Bikes'],row['Clothing']),axis=1)
  7. #将列名大写
  8. sales.rename(columns={'complex':'Complex'},inplace=True)
  9. sales.drop(columns={'Components'},inplace=True)
  10. print(sales)