创建虚拟数据
faker库
import pandas as pd
from faker import Faker
import random as rd
fake = Faker("zh-CN")
# 要输入到DF中的数据字典
dic = {}
# 姓名
list_name = []
# 随机手机号
list_phone_number = []
# 随机国家
list_country = []
# 随机邮件
list_email = []
# 随机车牌号
list_car_number = []
# 随机年份,格式2022-02-22
list_date = []
# 随机字符串,长度在min-max之间
list_str =[]
# 随机年龄
list_age = []
# 随机金额
list_money = []
# 索引
list_index = []
for i in range(0,1000):
list_index.append(i)
list_name.append(fake.name())
list_phone_number.append(fake.phone_number())
list_country.append(fake.country())
list_email.append(fake.email())
list_car_number.append(fake.license_plate())
list_date.append(fake.date_time(tzinfo=None, end_datetime=None))
list_str.append(fake.pystr(min_chars=0, max_chars=8))
list_age.append(rd.randint(15,65))
list_money.append(rd.random()*9999)
dic['Index'] = list_index
dic['Name'] = list_name
dic['Age'] = list_age
dic['Phone'] = list_phone_number
dic['Country'] = list_country
dic['Email'] = list_email
dic['CarNumber'] = list_car_number
dic['Date'] = list_date
dic['Note'] = list_str
dic['Money'] = list_money
df = pd.DataFrame(dic)
print(df.columns)
df.to_excel('./TestExcel/AlbertData.xlsx')
创建文件
import pandas as pd
# DataFrame-->Worksheet
# 初始化数据
df = pd.DataFrame({'ID': [1, 2, 3, 4, 5, 6], 'Name': ['Tim', 'Victor', 'Nick', 'Timthony', 'Yangzhongke', 'Zack']})
# 设置ID为索引,而不是默认的ID 0 1 2
df = df.set_index('ID')
print(df.columns)
print(df)
excelPath = './TestExcel/output.xlsx' # 使用相对路径
df.to_excel(excelPath)
print('Create excel successfully!')
读取文件
import pandas as pd
excelPath = './TestExcel/output.xlsx' # 使用相对路径
outExcel = pd.read_excel(excelPath, index_col='ID')
# outExcel = pd.read_excel(excelPath, header=1) # 从第1行开始作为标题
# 如果excel里面没有header,但是我们知道如何操作?下面两行
# 操作将Name列显示为了SubName
outExcel = pd.read_excel(excelPath, header=None)
outExcel.columns = ['ID', 'SubName']
# 将修改好的数据SubName写到新的文件中
outExcel.set_index('ID', inplace=True)
outExcel.to_excel('./TestExcel/output002.xlsx')
print('Done')
print(outExcel.shape) # 显示出这个表有几行几列
print(outExcel.columns) # 显示出所有列名称
print(f'我是文件头两行数据:\n{outExcel.head(2)}') # 查看前2行
print(f'我是文件尾三行数据:\n{outExcel.tail(3)}') # 后三行
行,列,单元格
import pandas as pd
# dic = {'x': 100, 'y': 200, 'z': 300}
# s1 = pd.Series(dic)
# 下面这种方式是一样的效果
# index是行号 name是列号 [100,200,300]是行内容
# s1 = pd.Series([100, 200, 300], index=['x', 'y', 'z'])
s1 = pd.Series([1, 2, 3], index=[1, 2, 3], name='A') # 一行数据
s2 = pd.Series([10, 20, 30], index=[1, 2, 3], name='B')
s3 = pd.Series([100, 200, 300], index=[2, 3, 4], name='C')
# 以Dictionary的形式将Series加入到DataFrame中
df = pd.DataFrame({s1.name: s1, s2.name: s2, s3.name: s3})
# df = df.set_index('A')
# 假如Series index是[2,3,4] 没有的则会填充NaN
print(df.head())
数据区域的读取,填充数字,文字序列
import pandas as pd
from datetime import date, timedelta
# 实现月份累加算法,d是初始时间,md是累加的月份
def add_month(d, md):
yd = md // 12
m = d.month + md % 12
if m != 12:
yd += m // 12
m = m % 12
return date(d.year + yd, m, d.day)
# 跳过空行16行,选取E:H列 指定ID列的数据类型为int(先转为string类型,直接转int类型不行)
books = pd.read_excel('./TestExcel/output004.xlsx', skiprows=16, usecols="E:H",
dtype={'ID': str, 'InStore': str, 'Date': str})
startTime = date(2021, 12, 25)
for i in books.index:
books['ID'].at[i] = i + 1
print(type(books['ID'])) # 此种写法books['ID']是pandas.core.series.Series对象
books['InStore'].at[i] = 'Yes' if i % 2 == 0 else 'NO' # 在Python中if else可以组成表达式用
# books['Date'].at[i] = startTime + timedelta(days=i) # 天数依次累加
# books['Date'].at[i] = date(startTime.year+i,startTime.month,startTime.day) # 年份依次累加
books['Date'].at[i] = add_month(startTime, i) # 月份依次累加
# print(books['ID'])
books.to_excel('./TestExcel/output004Test.xlsx')
print(books)
填充日期序列
import pandas as pd
from datetime import date, timedelta
# 实现月份累加算法,d是初始时间,md是累加的月份
def add_month(d, md):
yd = md // 12
m = d.month + md % 12
if m != 12:
yd += m // 12
m = m % 12
return date(d.year + yd, m, d.day)
# 跳过空行16行,选取E:H列 指定ID列的数据类型为int(先转为string类型,直接转int类型不行)
books = pd.read_excel('./TestExcel/output004.xlsx', skiprows=16, usecols="E:H",
dtype={'ID': str, 'InStore': str, 'Date': str})
startTime = date(2021, 12, 25)
for i in books.index:
books['ID'].at[i] = i + 1
print(type(books['ID'])) # 此种写法books['ID']是pandas.core.series.Series对象
books['InStore'].at[i] = 'Yes' if i % 2 == 0 else 'NO' # 在Python中if else可以组成表达式用
# books['Date'].at[i] = startTime + timedelta(days=i) # 天数依次累加
# books['Date'].at[i] = date(startTime.year+i,startTime.month,startTime.day) # 年份依次累加
books['Date'].at[i] = add_month(startTime, i) # 月份依次累加
# print(books['ID'])
books.to_excel('./TestExcel/output004Test.xlsx')
print(books)
函数填充,计算列
import pandas as pd
def add_2(x):
return x + 2
books = pd.read_excel('./TestExcel/output005.xlsx', dtype={'ID': str, 'Name': str},index_col='ID')
for i in books.index:
books['FirstPrice'].at[i] = 20 + i;
books['LastPrice'].at[i] = 30 + i;
# 列计算无需取出每行,直接列运算即可
books['FinalPrice'] = books['FirstPrice'] * books['LastPrice']
# 自己列加2块钱
books['FinalPrice'] += 2
# 另一种方式追加2块钱,函数
books['FinalPrice'] = books['FinalPrice'].apply(add_2)
# 另一种方式追加2块钱,lambda表达式
books['FinalPrice'] = books['FinalPrice'].apply(lambda x: x + 2)
print(books)
books.to_excel('./TestExcel/output005.xlsx')
排序
import pandas as pd
books = pd.read_excel('./TestExcel/output006.xlsx', dtype={'ID': str, 'Name': str}, index_col='ID')
# inplace=True是为了不生产新的DF
# 按照价格排序 ascending默认为True升序排列
# books.sort_values(by='FinalPrice', inplace=True, ascending=False)
# 先按照Worthly排序,再按照FinalPrice排序
books.sort_values(by=['Worthly', 'FinalPrice'], inplace=True, ascending=[True,False])
print(books)
数据筛选、过滤
import pandas as pd
def lastPrice_32_to_42(a):
return a >= 32 and a < 42
def finalPrice_657_to_2000(a):
return 657 <= a < 2000
books = pd.read_excel('./TestExcel/output007.xlsx', index_col='ID')
# 这边loc定位到某列,按照apply里面的函数关系来进行筛选
# books = books.loc[books['LastPrice'].apply(lastPrice_32_to_42)].loc[books['FinalPrice'].apply(finalPrice_657_to_2000)]
# 可以直接.列名来修改
books = books.loc[books.LastPrice.apply(lastPrice_32_to_42)].loc[books.FinalPrice.apply(finalPrice_657_to_2000)]
books.sort_values(by=['Worthly', 'FinalPrice'], inplace=True, ascending=[True, False])
print(books)
柱状图
import pandas as pd
import matplotlib.pyplot as plt
# 绘图,不要添加index_col='Field'索引列
books = pd.read_excel('./TestExcel/output008.xlsx')
print(books.head(5))
# 按照Number列排序 降序 替换原先的DF
books.sort_values(by='Number', ascending=False, inplace=True)
# 使用pandas绘图:横坐标为Field 纵坐标为Number 颜色橘色 标题xxx
# books.plot.bar(x='Field', y='Number',color='orange',title='InterNationalBooks')
# 使用plt绘图
plt.bar(books.Field, books.Number, color='orange')
# 旋转标签90
plt.xticks(books.Field, rotation=90)
plt.xlabel('Field')
plt.ylabel('Number')
plt.title('InternationalBooks', fontsize=16)
# 紧凑型绘图,能够将横坐标全部显示出来
plt.tight_layout()
plt.show()
分组柱图,深度优化
import pandas as pd
import matplotlib.pyplot as plt
books = pd.read_excel('./TestExcel/output009.xlsx')
print(books)
books.sort_values(by=2021, inplace=True, ascending=False)
books.plot.bar(x='Field', y=[2020, 2021],color=['orange','blue'])
plt.title('National Books', fontsize=16, fontweight='bold')
plt.xlabel('Field', fontweight='bold')
plt.ylabel('Number', fontweight='bold')
# 自定义X轴样式:斜45° 文字最末端对齐
ax = plt.gca()
ax.set_xticklabels(books.Field, rotation=45, ha='right')
# 将图片左边和底部固定
figures = plt.gcf()
figures.subplots_adjust(left=0.2,bottom=0.42)
# plt.tight_layout()
plt.show()
叠加柱状图,水平柱状图
import pandas as pd
import matplotlib.pyplot as plt
users = pd.read_excel('./TestExcel/output010.xlsx')
users['Total'] = users.Oct+users.Nov+users.Dec
users.sort_values(by='Total',ascending=False,inplace=True)
print(users)
# 分组柱状图-->叠加柱状图stacked
users.plot.bar(x='Name',y=['Oct','Nov','Dec'],stacked=True)
plt.title('User Behavior',fontsize=24,fontweight='bold')
# 旋转X轴文字45°
ax = plt.gca()
ax.set_xticklabels(users.Name,rotation=45,ha='right')
plt.tight_layout()
# 水平柱状图
users.sort_values(by='Total',ascending=True,inplace=True)
users.plot.barh(x='Name',y=['Oct','Nov','Dec'],stacked=True)
plt.tight_layout()
plt.show()
饼图
import pandas as pd
import matplotlib.pyplot as plt
students = pd.read_excel('./TestExcel/output011.xlsx',index_col='From')
print(students)
# 绘制饼图,默认情况下以index_col作为图例显示饼图
# 排序以顺时针形式呈现,并将开始位置设置为-250
# students['2017'].sort_values().plot.pie(startangle=-250)
# 第二种方式counterclock=False顺时针方式呈现
students['2017'].plot.pie(counterclock=False)
plt.title('Source of International Students')
plt.ylabel('2017',fontweight='bold')
plt.show()
折线趋势图,叠加区域图
import pandas as pd
import matplotlib.pyplot as plt
weeks = pd.read_excel('./TestExcel/output012.xlsx',index_col='Week')
print(weeks)
print(weeks.columns)
# 折线图
weeks.plot(y=['Accessories', 'Bikes', 'Clothing', 'Components', 'Grand Total'])
plt.title('Sales Weekly Trend',fontsize=16,fontweight='bold')
plt.ylabel('Total',fontsize=12,fontweight='bold')
plt.xticks(weeks.index)
# 叠加区域图
weeks.plot.area(y=['Accessories', 'Bikes', 'Clothing', 'Components', 'Grand Total'])
plt.title('Sales Weekly Trend',fontsize=16,fontweight='bold')
plt.ylabel('Total',fontsize=12,fontweight='bold')
plt.xticks(weeks.index)
plt.show()
散点图、直方图、密度图、数据相关性
import pandas as pd
import matplotlib.pyplot as plt
# 显示所有列
pd.options.display.max_columns =777
homes = pd.read_excel('./TestExcel/output013.xlsx')
# print(homes.head())
# 散点图scatter
#homes.plot.scatter(x='sqft_living',y='price')
# 直方图hist 分布区间bins
# homes.sqft_living.plot.hist(bins=100)
# plt.xticks(range(0,max(homes.sqft_living),5000),fontsize=8,rotation=90)
# 密度图kde
homes.sqft_living.plot.kde()
plt.xticks(range(0,max(homes.sqft_living),5000),fontsize=8,rotation=90)
plt.show()
# 数据相关性分析 column row relation
print(homes.corr())
多表联合(从VLOOKUP到JOIN)
连接了两张表,一张学生姓名表,一张分数表
from typing import SupportsRound
import pandas as pd
students = pd.read_excel('./TestExcel/output014.xlsx',sheet_name='Students',index_col='ID')
scores = pd.read_excel('./TestExcel/output014.xlsx',sheet_name='Scores',index_col='ID')
# students表join scores表 how表示不论是否能查询到数据始终保持students表的数据 以ID列来join
# fillna 用0来填充空值
# 如果左右两边ID列名字不一样 left_on right_on
# table = students.merge(scores,how='left',left_on=students.index, right_on=scores.index).fillna(0)
table = students.join(scores,how='left').fillna(0)
# 将浮点数转为原始的int数据
table.Score = table.Score.astype(int)
print(table)
数据校验、轴的概念
import pandas as pd
def validate_score(x):
"""[校验函数]
Args:
x ([object]): [学生信息]
"""
try:
assert 0<=x.Score<=100
except:
print(f'#{x.ID}\tStudent {x.Name} has an invalid score {x.Score}')
#第二种写法
#if not 0<=x.Score<=100:
#print(f'#{x.ID}\tStudent {x.Name} has an invalid score {x.Score}')
students = pd.read_excel('./TestExcel/output015.xlsx')
# axis默认为0,从上到下 1,从左到右进行校验
students.apply(validate_score,axis=1)
# print(students)
把一列数据分割成两列
import pandas as pd
Students = pd.read_excel('./TestExcel/output015.xlsx')
# 一列数据分割成两列expand=True n=0切出来多少子字符串就保留多少
df = Students.Name.str.split('_',expand=True)
Students['StudentTitle'] = df[0].str.upper()
Students['StudentIndex'] = df[1]
print(Students)
求和、求平均、统计引导
import pandas as pd
house_price = pd.read_excel('./TestExcel/output013.xlsx')
# DataFrame.sum()
# axis 1横轴 0纵轴 sum求和 mean求平均
temp_total = house_price[['bedrooms','bathrooms']].sum(axis=1)
temp_average = house_price[['bedrooms','bathrooms']].mean(axis=1)
house_price['TotalRooms'] = temp_total
house_price['AverageRooms'] = temp_average
# house_price['AverageRooms'] .astype(int)
# 追加列方向的累加
col_sum = house_price[['ID','price','bedrooms','bathrooms','sqft_living','sqft_basement']].sum()
col_sum['TotalRooms'] = 'Summary'
# 将自动按照col_sum的列名来进行追加
house_price = house_price.append(col_sum,ignore_index=True)
print(house_price)
定位、消除重复数据
from os import dup
import pandas as pd
students = pd.read_excel('./TestExcel/output009.xlsx')
row_sum = students[['Firstyear','Lastyear']].sum(axis=1)
students['TotalSales'] = row_sum
col_sum = students[['Firstyear','Lastyear','TotalSales']].sum()
col_sum['Field'] = 'Summary'
students = students.append(col_sum,ignore_index=True)
print(students)
print('\n')
# 取出那些重复的数据
dupe = students.duplicated(subset='Field')
print(dupe.any())
print(dupe.count())
print(dupe[dupe]) #自己和True比较
dupe = dupe[dupe==True]
print(f'{dupe.index}\n')
print(f'{students.iloc[dupe.index]}\n') #取出那些数据
# 消除重复数据,基于哪些列 subset=['Field',Firstyear'] keep='first'/'last'保留开始还是后面的
students.drop_duplicates(subset='Field',inplace=True,keep='first')
print(students)
旋转数据表(行转换)
import pandas as pd
# 显示最大列
pd.options.display.max_columns = 999
videos = pd.read_excel('./TestExcel/output016.xlsx',index_col='Month')
# 旋转表格,要将index_col设置为旋转坐标列Month,不然就会以默认index来作为列名
videos = videos.transpose()
print(videos)
读取CVS、TSV、TXT文件中的数据
import pandas as pd
students1 = pd.read_csv('./TestOtherFiles/Students.csv')
print(students1)
students2 = pd.read_csv('./TestOtherFiles/Students.txt',sep='|')
print(students2)
透视表、分组、聚合(GroupBy)
import pandas as pd
import numpy as np
pd.options.display.max_columns = 777
orders = pd.read_excel('./TestExcel/output017.xlsx')
orders['Year'] = pd.DatetimeIndex(orders.Date).year
print(orders)
piovot_table1=orders.pivot_table(index='Category',columns='Year',values='Total',aggfunc=np.sum)
print(piovot_table1)
groups = orders.groupby(['Category','Year'])
s = groups['Total'].sum()
c = groups['ID'].count()
piovot_table2 = pd.DataFrame({'Sum':s,'Count':c})
print(piovot_table2)
线性回归、数据预测
数据分析和机器学习的边界
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import linregress
def validate_revenue(series):
if not 5<series.Revenue<20:
print(f'{series}值不在范围内')
sales = pd.read_excel('./TestExcel/output018.xlsx',dtype={'Month':str})
# 斜率,截距
slope,intercept,r,p,std_err = linregress(sales.index,sales.Revenue)
# 线性回归方程
exp = sales.index*slope+intercept
# sales.plot.area(y='Revenue')
# 散点图
plt.scatter(x=sales.index,y=sales['Revenue'],color='orange')
plt.plot(sales.index,exp,color='blue')
#plt.bar(sales.index,sales.Revenue)
ax = plt.gca()
ax.set_xticklabels(sales['Month'],rotation=45,ha='right')
plt.title(f'y={slope}*X+{intercept}')
plt.tight_layout()
plt.show()
# 横向验证,传入函数的是series
#sales.apply(validate_revenue,axis=1)
#print(sales.Month.dtype)
条件格式化
anacoda input: jupyter notebook
import pandas as pd
def low_score_red(s):
color = 'red' if s<21000 else 'black'
return f'color:{color}'
def highest_score_green(col):
#把字体编程绿色,底层变为黄绿色
return ['background-color:lime;color:green' if s==col.max() else 'background-color:white' for s in col]
students = pd.read_excel('F:/Repo/albertpython/TestExcel/output011.xlsx')
students.style.applymap(low_score_red,subset=['2016','2017'])\
.apply(highest_score_green,subset=['2016','2017'])
import pandas as pd
import seaborn as sns
color_map = sns.light_palette('green',as_cmap=True)
students = pd.read_excel('F:/Repo/albertpython/TestExcel/output011.xlsx')
# 将颜色深度应用到数据列上
students.style.background_gradient(color_map,subset=['2016','2017'])
import pandas as pd
#import seaborn as sns
#color_map = sns.light_palette('green',as_cmap=True)
students = pd.read_excel('F:/Repo/albertpython/TestExcel/output011.xlsx')
# 将颜色深度应用到数据列上
# students.style.background_gradient(color_map,subset=['2016','2017'])
students.style.bar(color='orange',subset=['2016','2017'])
行操作集锦
import pandas as pd
students1 = pd.read_excel('F:/Repo/albertpython/TestExcel/output011.xlsx',sheet_name='Sheet1')
students2 = pd.read_excel('F:/Repo/albertpython/TestExcel/output011.xlsx',sheet_name='Sheet2')
# 两个Sheet合并为一个,重置index_col drop=True放弃原来的
student_total = students1.append(students2).reset_index(drop=True)
# 追加手动创建的一行
stu = pd.Series({'Rank':100,'From':'USA','2016':90,'2017':100})
student_total=student_total.append(stu,ignore_index=True)
# 更改其中一行的值
student_total['From'].at[13] = 'Mexico3'
student_total.at[12,'From'] = 'Japan3'
# 插入一行 切片技术
stu = pd.Series({'Rank':101,'From':'USB','2016':90,'2017':100})
# 左闭右开
part1 = student_total[:10]
part2 = student_total[10:]
student_total = part1.append(stu,ignore_index=True).append(part2).reset_index(drop=True)
# 删除数据行
# student_total.drop(index=[0,1,2],inplace=True)
# student_total.drop(index=range(4,6),inplace=True)
for i in range(5,8):
student_total['From'].at[i] =''
missing_data = student_total.loc[student_total['From']=='']
student_total.drop(index=missing_data.index,inplace=True) #切片的方式传入
student_total.reset_index(drop=True) # 重新铺一下index
print(student_total)
列操作集锦
import pandas as pd
import numpy as np
students = pd.read_excel('./TestExcel/output014.xlsx',sheet_name='Students')
scores = pd.read_excel('./TestExcel/output014.xlsx',sheet_name='Scores')
# 列横向拼接
list_total = pd.concat([students,scores],axis=1).reset_index(drop=True)
print(list_total)
# 行追加第二种写法
list_total2 = pd.concat([students,scores],ignore_index=True).reset_index(drop=True)
print(list_total2)
# 将ID相同Join进来
list_total3 = students.merge(scores,how='left',on=['ID','ID'])
# 追加一列:
list_total3['Age'] = 25
list_total3['RealAge'] = np.arange(0,len(list_total3))
print(f'Total3:\n{list_total3}\n')
print(np.arange(0,len(list_total3)))
# 删除列
list_total3.drop(columns=['Age','RealAge'],inplace=True)
print(f'DropTotal3:\n{list_total3}\n')
# 插入列
list_total3.insert(1,column='Foo',value=26)
print(f'DropTotal3:\n{list_total3}\n')
# 列名大写
list_total3.rename(columns={'Foo':'FOO'},inplace=True)
print(f'Upper:\n{list_total3}\n')
# 删除空行
list_total3.dropna(inplace=True)
print(f'Dropnan:\n{list_total3}\n')
读取数据库
import pandas as pd
import numpy as np
import pyodbc as dbc
# ORM框架 create_engine
import sqlalchemy
books = pd.read_excel('./TestExcel/output019.xlsx')
print(books)
# 此处注意Server={ServerName} ServerName未必是{local}
connection = dbc.connect('Driver={SQL Server};'
'Server={DESKTOP-07OJ9VD};'
'Database=AlbertBook;'
'Trusted_Connection=yes;')
query = 'Select Title from T_Books'
df1 = pd.read_sql_query(query,connection)
print(df1)
编写复杂方程
import pandas as pd
import numpy as np
def cal_bike_complex_clothing(b,c):
return b**2+c*np.pi
sales = pd.read_excel('./TestExcel/output012.xlsx')
sales['complex'] = sales.apply(lambda row:cal_bike_complex_clothing(row['Bikes'],row['Clothing']),axis=1)
#将列名大写
sales.rename(columns={'complex':'Complex'},inplace=True)
sales.drop(columns={'Components'},inplace=True)
print(sales)