DataFrame(数据框)
import numpy as np
import pandas as pd
# 定义:是一个类似于表格的数据类型,可以理解为一个二维数组,索引有两个维度,可更改
# 特点:
# - 列可以是不同的类型
# - 大小可变
# - 标记轴(行与列)
# - 针对行与列进行轴向统计
# Create an empty DataFrame (no rows, no columns).
df0 = pd.DataFrame()
df0
# Build a single-column DataFrame from a flat list, with explicit row labels
# and an explicit column name.
data = [1, 2, 3, 4, 5]
df1 = pd.DataFrame(
    data,
    index=['s1', 's2', 's3', 's4', 's5'],
    columns=['sortNUM'],
)
df1
|    | sortNUM |
|----|---------|
| s1 | 1 |
| s2 | 2 |
| s3 | 3 |
| s4 | 4 |
| s5 | 5 |
# Build a DataFrame from a nested list: each inner list is one row.
data = [['Alex', 10], ['Bob', 12], ['Clarke', 13]]
# Cast only the numeric column. A frame-wide dtype=float cannot convert the
# 'Name' strings; older pandas silently skipped them, newer pandas raises.
df2 = pd.DataFrame(data, columns=['Name', 'Age']).astype({'Age': float})
df2
|   | Name | Age |
|---|------|-----|
| 0 | Alex | 10.0 |
| 1 | Bob | 12.0 |
| 2 | Clarke | 13.0 |
# Build a DataFrame from a list of dicts: keys become columns, and a key
# missing from a record shows up as NaN in that row.
data = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
df3 = pd.DataFrame(data)
df3
|   | a | b | c |
|---|---|---|---|
| 0 | 1 | 2 | NaN |
| 1 | 5 | 10 | 20.0 |
# Build a DataFrame from a dict of equal-length lists (one list per column),
# with explicit row labels.
data = {
    'Name': ['Tom', 'Jack', 'Steve', 'Ricky'],
    'Age': [28, 34, 29, 42],
}
df4 = pd.DataFrame(data, index=['s1', 's2', 's3', 's4'])
df4
|    | Name | Age |
|----|------|-----|
| s1 | Tom | 28 |
| s2 | Jack | 34 |
| s3 | Steve | 29 |
| s4 | Ricky | 42 |
# Build a DataFrame from a dict of Series. The Series are aligned on the union
# of their indexes, so 'one' has no value for row 'd' and gets NaN there.
data = {
    'one': pd.Series([1, 2, 3], index=['a', 'b', 'c']),
    'two': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
}
df5 = pd.DataFrame(data)
df5
|   | one | two |
|---|-----|-----|
| a | 1.0 | 1 |
| b | 2.0 | 2 |
| c | 3.0 | 3 |
| d | NaN | 4 |
属性
df5.index  # row labels
Index(['a', 'b', 'c', 'd'], dtype='object')
df5.columns  # column labels
Index(['one', 'two'], dtype='object')
df5.dtypes  # dtype of each column
one    float64
two      int64
dtype: object
df5.info()  # print a summary: index, columns, non-null counts, dtypes, memory
<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, a to d
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   one     3 non-null      float64
 1   two     4 non-null      int64
dtypes: float64(1), int64(1)
memory usage: 96.0+ bytes
# Load an Excel workbook into a DataFrame (rebinds the name `data`).
# NOTE(review): the absolute Windows path is machine-specific; point it at a
# local copy of the workbook before running this cell.
data = pd.read_excel(r"C:\Users\Johnny Zhou\Desktop\用户价值分层企业名单明细.xlsx")
data.info()  # print the summary of the frame loaded from Excel
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5744 entries, 0 to 5743
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   企业名             5744 non-null   object
 1   使用产品            5744 non-null   int64
 2   全产品留存情况         5744 non-null   object
 3   最近一次发单时间距离3月1日  5744 non-null   float64
 4   首发时间距离3月1日有多久   5744 non-null   float64
 5   留存时间            5744 non-null   float64
 6   累计充值金额          5744 non-null   int64
 7   用户分类            5744 non-null   object
dtypes: float64(3), int64(2), object(3)
memory usage: 359.1+ KB
df5.values  # the frame's data as a NumPy array
array([[ 1., 1.], [ 2., 2.], [ 3., 3.], [nan, 4.]])
df5.axes  # list holding the row index and the column index
[Index(['a', 'b', 'c', 'd'], dtype='object'), Index(['one', 'two'], dtype='object')]
df5.ndim   # number of axes
df5.size   # total number of elements
df5.shape  # (rows, columns)
(4, 2)
# Build a small all-integer frame and inspect the inferred column dtypes.
d = {'col1': [1, 2], 'col2': [3, 4]}
df = pd.DataFrame(data=d)
df.dtypes
col1    int64
col2    int64
dtype: object
# Cast every column to int32 and inspect the resulting dtypes.
df00 = df.astype('int32')
df00.dtypes
col1    int32
col2    int32
dtype: object
# Frame with mixed column types and missing values, used below to show how
# convert_dtypes() treats each kind of column.
df = pd.DataFrame(
    {
        "a": pd.Series([1, 2, 3], dtype="int32"),
        "b": pd.Series(["x", "y", "z"], dtype=object),
        "c": pd.Series([True, False, np.nan], dtype=object),
        "d": pd.Series(["h", "i", np.nan], dtype=object),
        "e": pd.Series([10, np.nan, 20], dtype="float64"),
        "f": pd.Series([np.nan, 100.5, 200], dtype="float64"),
    }
)
df
|   | a | b | c | d | e | f |
|---|---|---|---|---|---|---|
| 0 | 1 | x | True | h | 10.0 | NaN |
| 1 | 2 | y | False | i | NaN | 100.5 |
| 2 | 3 | z | NaN | NaN | 20.0 | 200.0 |
df.info()  # print the summary: non-null counts and dtypes per column
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   a       3 non-null      int32
 1   b       3 non-null      object
 2   c       2 non-null      object
 3   d       2 non-null      object
 4   e       2 non-null      float64
 5   f       2 non-null      float64
dtypes: float64(2), int32(1), object(3)
memory usage: 260.0+ bytes
# Convert each column to the best available dtype that supports pd.NA.
df01 = df.convert_dtypes()
df01
|   | a | b | c | d | e | f |
|---|---|---|---|---|---|---|
| 0 | 1 | x | True | h | 10 | NaN |
| 1 | 2 | y | False | i | <NA> | 100.5 |
| 2 | 3 | z | <NA> | <NA> | 20 | 200.0 |
df01.dtypes  # column dtypes after convert_dtypes()
a      Int32
b     string
c    boolean
d     string
e      Int64
f    float64
dtype: object
# A column mixing a string with integers is stored with the object dtype.
df = pd.DataFrame({"A": ["a", 1, 2, 3]})
df
df.dtypes
A    object
dtype: object
# Drop the first row (the string), leaving only the integer values.
df = df.iloc[1:]
df
df.dtypes  # slicing does not re-infer the dtype; the column is still object
A    object
dtype: object
df.infer_objects().dtypes  # re-infer a more specific dtype for object columns
A    int64
dtype: object