基础 - DataFrame定义和创建及属性 - 《数据分析》

DataFrame(数据框)

import numpy as np 
import pandas as pd

# 定义：是一个类似于表格的数据类型，可以理解为一个二维数组，索引有两个维度，可更改
#特点
# 列可以是不同的类型
# 大小可变
# 标记轴（行与列）
# 针对行与列进行轴向统计

# Create an empty DataFrame (no rows, no columns).
df0 = pd.DataFrame(data=None, index=None, columns=None)
df0

# Build a single-column frame from a flat list, labelling the rows s1..s5.
data = list(range(1, 6))
df1 = pd.DataFrame(
    data,
    index=[f"s{position}" for position in data],
    columns=["sortNUM"],
)
df1

	sortNUM
s1	1
s2	2
s3	3
s4	4
s5	5

# Build a frame from a two-dimensional (nested) list.
data = [['Alex', 10], ['Bob', 12], ['Clarke', 13]]
# Cast only the numeric column. Passing dtype=float to the constructor asks
# pandas to convert every column, and 'Name' holds strings that cannot be
# converted: pandas >= 2.0 raises ValueError instead of silently skipping it.
df2 = pd.DataFrame(data, columns=['Name', 'Age']).astype({'Age': float})
df2

	Name	Age
0	Alex	10.0
1	Bob	12.0
2	Clarke	13.0

# Build a frame from a list of dicts: each dict is one row, and keys missing
# from a row show up as NaN in that row.
data = [
    dict(a=1, b=2),
    dict(a=5, b=10, c=20),
]
df3 = pd.DataFrame(data=data)
df3

	a	b	c
0	1	2	NaN
1	5	10	20.0

# Build a frame from a dict of equal-length lists: each key becomes a column.
data = dict(
    Name=['Tom', 'Jack', 'Steve', 'Ricky'],
    Age=[28, 34, 29, 42],
)
df4 = pd.DataFrame(data, index=[f"s{row}" for row in range(1, 5)])
df4

	Name	Age
s1	Tom	28
s2	Jack	34
s3	Steve	29
s4	Ricky	42

# Build a frame from a dict of Series. The Series are aligned on their index
# labels, so 'one' (which has no label 'd') gets NaN in row 'd'.
data = {
    "one": pd.Series([1, 2, 3], index=list("abc")),
    "two": pd.Series([1, 2, 3, 4], index=list("abcd")),
}
df5 = pd.DataFrame(data=data)
df5

	one	two
a	1.0	1
b	2.0	2
c	3.0	3
d	NaN	4

属性

# Row labels of the frame.
df5.index

Index(['a', 'b', 'c', 'd'], dtype='object')

# Column labels of the frame.
df5.columns

Index(['one', 'two'], dtype='object')

# Data type of each column ('one' is float64 because it contains NaN).
df5.dtypes

one    float64
two      int64
dtype: object

df5.info()  # print a summary: index, per-column non-null counts and dtypes, memory usage

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, a to d
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   one     3 non-null      float64
 1   two     4 non-null      int64  
dtypes: float64(1), int64(1)
memory usage: 96.0+ bytes

# Load a local Excel workbook into a DataFrame (rebinds `data`).
# NOTE(review): absolute path on the author's machine - point it at your own copy before running.
data = pd.read_excel(r"C:\Users\Johnny Zhou\Desktop\用户价值分层企业名单明细.xlsx")

# Summary of the loaded workbook: row count, columns, non-null counts, dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5744 entries, 0 to 5743
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   企业名             5744 non-null   object 
 1   使用产品            5744 non-null   int64  
 2   全产品留存情况         5744 non-null   object 
 3   最近一次发单时间距离3月1日  5744 non-null   float64
 4   首发时间距离3月1日有多久   5744 non-null   float64
 5   留存时间            5744 non-null   float64
 6   累计充值金额          5744 non-null   int64  
 7   用户分类            5744 non-null   object 
dtypes: float64(3), int64(2), object(3)
memory usage: 359.1+ KB

df5.values # the data as a NumPy array (both columns come back as floats here)

array([[ 1.,  1.],
       [ 2.,  2.],
       [ 3.,  3.],
       [nan,  4.]])

df5.axes # list holding the row index followed by the column index

[Index(['a', 'b', 'c', 'd'], dtype='object'),
 Index(['one', 'two'], dtype='object')]

# A notebook echoes only the last bare expression of a cell, so the output
# below is the shape alone.
df5.ndim  # number of axes
df5.size  # size (element count)
df5.shape  # (rows, columns)

(4, 2)

# Two integer columns built from a plain dict of lists.
d = dict(col1=[1, 2], col2=[3, 4])
df = pd.DataFrame(d)
df.dtypes

col1    int64
col2    int64
dtype: object

# Cast every column to int32 and keep the result as a new frame `df00`.
df00 = df.astype('int32')
df00.dtypes

col1    int32
col2    int32
dtype: object

# Frame with one column per dtype flavour (int32, object, float), several of
# them containing missing values; used as input for the dtype conversion demo.
df = pd.DataFrame(
    {
        label: pd.Series(values, dtype=np.dtype(kind))
        for label, (values, kind) in {
            "a": ([1, 2, 3], "int32"),
            "b": (["x", "y", "z"], "O"),
            "c": ([True, False, np.nan], "O"),
            "d": (["h", "i", np.nan], "O"),
            "e": ([10, np.nan, 20], "float"),
            "f": ([np.nan, 100.5, 200], "float"),
        }.items()
    }
)

df

	a	b	c	d	e	f
0	1	x	True	h	10.0	NaN
1	2	y	False	i	NaN	100.5
2	3	z	NaN	NaN	20.0	200.0

# Summary before conversion: columns c, d, e and f each have one missing value.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   a       3 non-null      int32  
 1   b       3 non-null      object 
 2   c       2 non-null      object 
 3   d       2 non-null      object 
 4   e       2 non-null      float64
 5   f       2 non-null      float64
dtypes: float64(2), int32(1), object(3)
memory usage: 260.0+ bytes

# Convert each column to a better-fitting dtype that supports missing values
# (the resulting dtypes are listed further below).
df01 = df.convert_dtypes()
df01

	a	b	c	d	e	f
0	1	x	True	h	10	NaN
1	2	y	False	i	<NA>	100.5
2	3	z	<NA>	<NA>	20	200.0

# Column dtypes after convert_dtypes().
df01.dtypes

a      Int32
b     string
c    boolean
d     string
e      Int64
f    float64
dtype: object

# A column mixing a string with integers is stored with the generic object dtype.
df = pd.Series(["a", 1, 2, 3], name="A").to_frame()
df
df.dtypes

A    object
dtype: object

# Drop the first row (the string 'a'), leaving only the integer values.
df = df.iloc[1:]
df

	A
1	1
2	2
3	3

# Slicing does not re-infer dtypes: the column is still object.
df.dtypes

A    object
dtype: object

df.infer_objects().dtypes  # re-infer a more specific dtype for object columns (int64 here)

A    int64
dtype: object