pandas分块读取大批量Excel数据 - 《00_Bug 解决记录文档》

当 Excel 文件数据量很大时，可以用 pandas 按固定行数分块读取，最后再合并成一个 DataFrame。
import pandas as pd
# Input workbook passed to read_thunk() when this file is run as a script
# (absolute Windows path, written as a raw string so backslashes stay literal).
read_path = r'D:\06_items\01_数据处理\dispose_data\aaa.xlsx'
def read_thunk(read_path, nrows):
    """Read the first worksheet of an Excel workbook in fixed-size row chunks.

    The header row is parsed once to obtain the column names. The data rows
    are then read ``nrows`` at a time and concatenated into one DataFrame.

    Args:
        read_path: Path of the Excel workbook to read.
        nrows: Maximum number of data rows to read per chunk; must be >= 1.

    Returns:
        A DataFrame holding every data row of the first worksheet, labelled
        with the header's column names and indexed 0..n-1. If the worksheet
        has no data rows, an empty DataFrame carrying only the column names.

    Raises:
        ValueError: If ``nrows`` is smaller than 1.
    """
    if nrows < 1:
        raise ValueError(f"nrows must be a positive integer, got {nrows!r}")

    # Open the workbook once and reuse it for every read instead of
    # re-opening the file per chunk; the context manager also guarantees the
    # file handle is released.
    with pd.ExcelFile(read_path) as xl:
        sheet_names = xl.sheet_names
        print(sheet_names)
        sheet = sheet_names[0]
        # nrows=0 parses the header row only, so no data row ends up in the
        # header frame (it would otherwise be duplicated by the first chunk).
        df_header = xl.parse(sheet_name=sheet, nrows=0)
        print(f"Excel file: {read_path} (worksheet: {sheet})")

        chunks = []
        # Start right after the header row; chunks are read with header=None
        # so that their first row is treated as data.
        skiprows = 1
        while True:
            df_chunk = xl.parse(
                sheet_name=sheet,
                nrows=nrows, skiprows=skiprows, header=None)
            # An empty chunk means the previous read consumed the last rows.
            if df_chunk.empty:
                break
            print(f"  - chunk {len(chunks)} ({df_chunk.shape[0]} rows)")
            chunks.append(df_chunk)
            skiprows += nrows

    if not chunks:
        # Header-only (or completely empty) sheet: return the empty frame
        # with the column names rather than None.
        return df_header

    # ignore_index gives the result one continuous 0..n-1 index instead of
    # repeating each chunk's own 0..nrows-1 labels.
    df = pd.concat(chunks, ignore_index=True)
    # Chunks carry positional column labels (0, 1, ...); map them onto the
    # header names by position.
    df = df.rename(columns=dict(enumerate(df_header.columns)))
    return df
if __name__ == '__main__':
    # Script entry point: load the configured workbook 1000 rows per chunk.
    df = read_thunk(read_path, nrows=1000)