DataFrame(数据框)

    1. import numpy as np
    2. import pandas as pd
    1. # 定义:DataFrame 是一种类似于表格的二维数据结构,带有行索引和列索引两个维度,其内容可以修改
    2. #特点
    3. # 列可以是不同的类型
    4. # 大小可变
    5. # 标记轴(行与列)
    6. # 针对行与列进行轴向统计
    1. # 创建一个空的
    2. df0 = pd.DataFrame()
    3. df0
    1. # 通过列表创建
    2. data = [1,2,3,4,5]
    3. df1 = pd.DataFrame(data,index = ['s1','s2','s3','s4','s5'],columns = ['sortNUM'])
    4. df1
    sortNUM
    s1 1
    s2 2
    s3 3
    s4 4
    s5 5
    1. #用二维列表创建(注意:dtype=float 无法转换字符串列 Name;旧版 pandas 会忽略该列,pandas 2.0 及以后会直接报错)
    2. data = [['Alex',10],['Bob',12],['Clarke',13]]
    3. df2 = pd.DataFrame(data,columns=['Name','Age'],dtype=float)
    4. df2
    Name Age
    0 Alex 10.0
    1 Bob 12.0
    2 Clarke 13.0
    1. #列表套字典创建
    2. data = [{'a':1,'b':2},{'a':5,'b':10,'c':20}]
    3. df3 = pd.DataFrame(data)
    4. df3
    a b c
    0 1 2 NaN
    1 5 10 20.0
    1. #字典创建
    2. data = {'Name':['Tom', 'Jack', 'Steve', 'Ricky'],'Age': [28,34,29,42]}
    3. df4 = pd.DataFrame(data, index=['s1','s2','s3','s4'])
    4. df4
    Name Age
    s1 Tom 28
    s2 Jack 34
    s3 Steve 29
    s4 Ricky 42
    1. #用字典(值为 Series)创建:各 Series 按索引自动对齐,缺失位置填充 NaN
    2. data = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']),
    3. 'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}
    4. df5 = pd.DataFrame(data)
    5. df5
    one two
    a 1.0 1
    b 2.0 2
    c 3.0 3
    d NaN 4

    属性

    1. df5.index
    1. Index(['a', 'b', 'c', 'd'], dtype='object')
    1. df5.columns
    1. Index(['one', 'two'], dtype='object')
    1. df5.dtypes
    1. one float64
    2. two int64
    3. dtype: object
    1. df5.info() #信息摘要
    1. <class 'pandas.core.frame.DataFrame'>
    2. Index: 4 entries, a to d
    3. Data columns (total 2 columns):
    4. # Column Non-Null Count Dtype
    5. --- ------ -------------- -----
    6. 0 one 3 non-null float64
    7. 1 two 4 non-null int64
    8. dtypes: float64(1), int64(1)
    9. memory usage: 96.0+ bytes
    1. data = pd.read_excel(r"C:\Users\Johnny Zhou\Desktop\用户价值分层企业名单明细.xlsx")
    1. data.info()
    1. <class 'pandas.core.frame.DataFrame'>
    2. RangeIndex: 5744 entries, 0 to 5743
    3. Data columns (total 8 columns):
    4. # Column Non-Null Count Dtype
    5. --- ------ -------------- -----
    6. 0 企业名 5744 non-null object
    7. 1 使用产品 5744 non-null int64
    8. 2 全产品留存情况 5744 non-null object
    9. 3 最近一次发单时间距离3月1日 5744 non-null float64
    10. 4 首发时间距离3月1日有多久 5744 non-null float64
    11. 5 留存时间 5744 non-null float64
    12. 6 累计充值金额 5744 non-null int64
    13. 7 用户分类 5744 non-null object
    14. dtypes: float64(3), int64(2), object(3)
    15. memory usage: 359.1+ KB
    1. df5.values #以 NumPy 数组的形式返回数据
    1. array([[ 1., 1.],
    2. [ 2., 2.],
    3. [ 3., 3.],
    4. [nan, 4.]])
    1. df5.axes # 返回轴属性
    1. [Index(['a', 'b', 'c', 'd'], dtype='object'),
    2. Index(['one', 'two'], dtype='object')]
    1. df5.ndim #轴数
    2. df5.size #大小
    3. df5.shape #形状(单元格只显示最后一个表达式的结果)
    1. (4, 2)
    1. d = {'col1': [1, 2], 'col2': [3, 4]}
    2. df = pd.DataFrame(data=d)
    3. df.dtypes
    1. col1 int64
    2. col2 int64
    3. dtype: object
    1. df00 = df.astype('int32')
    2. df00.dtypes
    1. col1 int32
    2. col2 int32
    3. dtype: object
    1. df = pd.DataFrame(
    2. {
    3. "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
    4. "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
    5. "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")),
    6. "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")),
    7. "e": pd.Series([10, np.nan, 20], dtype=np.dtype("float")),
    8. "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
    9. }
    10. )
    1. df
    a b c d e f
    0 1 x True h 10.0 NaN
    1 2 y False i NaN 100.5
    2 3 z NaN NaN 20.0 200.0
    1. df.info()
    1. <class 'pandas.core.frame.DataFrame'>
    2. RangeIndex: 3 entries, 0 to 2
    3. Data columns (total 6 columns):
    4. # Column Non-Null Count Dtype
    5. --- ------ -------------- -----
    6. 0 a 3 non-null int32
    7. 1 b 3 non-null object
    8. 2 c 2 non-null object
    9. 3 d 2 non-null object
    10. 4 e 2 non-null float64
    11. 5 f 2 non-null float64
    12. dtypes: float64(2), int32(1), object(3)
    13. memory usage: 260.0+ bytes
    1. df01 = df.convert_dtypes()
    2. df01
    a b c d e f
    0 1 x True h 10 NaN
    1 2 y False i <NA> 100.5
    2 3 z <NA> <NA> 20 200.0
    1. df01.dtypes
    1. a Int32
    2. b string
    3. c boolean
    4. d string
    5. e Int64
    6. f float64
    7. dtype: object
    1. df = pd.DataFrame({"A": ["a", 1, 2, 3]})
    2. df
    3. df.dtypes
    1. A object
    2. dtype: object
    1. df = df.iloc[1:]
    2. df
    A
    1 1
    2 2
    3 3
    1. df.dtypes
    1. A object
    2. dtype: object
    1. df.infer_objects().dtypes #为 object 类型的列推断更合适的数据类型
    1. A int64
    2. dtype: object