数据量较大时,可以用 pandas 分块读取 Excel 文件,最后再合并成一个 DataFrame。

    1. import pandas as pd
    # Location of the Excel workbook that the script entry point loads.
    read_path = r"D:\06_items\01_数据处理\dispose_data\aaa.xlsx"
    def read_thunk(read_path, nrows):
        """Read the first worksheet of an Excel workbook in chunks of ``nrows`` rows.

        The workbook is opened once and reused for every read. The header row
        supplies the column names; data rows are then read ``nrows`` at a time
        until a read returns no rows, and the chunks are combined into a single
        DataFrame.

        Args:
            read_path: Path of the Excel workbook to read.
            nrows: Maximum number of data rows per chunk; must be at least 1.

        Returns:
            A DataFrame holding the data rows of the first worksheet, labelled
            with the header's column names and indexed 0..n-1. If the sheet
            has no data rows, an empty DataFrame carrying only the header
            columns is returned.

        Raises:
            ValueError: If ``nrows`` is smaller than 1.
        """
        if nrows < 1:
            raise ValueError(f"nrows must be a positive integer, got {nrows!r}")

        # Open the workbook once: the context manager closes the file handle,
        # and every read below reuses this object instead of re-opening the
        # file from its path for each chunk.
        with pd.ExcelFile(read_path) as xl:
            print('xl--->', xl)
            sheet_names = xl.sheet_names
            print(sheet_names)
            sheet = sheet_names[0]
            # nrows=0 parses the header row only. Reading a data row here as
            # well would duplicate it, because the chunk loop below starts at
            # the first data row.
            df_header = pd.read_excel(xl, sheet_name=sheet, nrows=0)
            print(f"Excel file: {read_path} (worksheet: {sheet})")

            chunks = []
            skiprows = 1  # skip the header row; data starts right after it
            while True:
                df_chunk = pd.read_excel(
                    xl, sheet_name=sheet,
                    nrows=nrows, skiprows=skiprows, header=None)
                # shape[0] is the number of rows read (shape[1] would be the
                # number of columns); zero rows means the data is exhausted.
                if not df_chunk.shape[0]:
                    break
                print(f" - chunk {len(chunks)} ({df_chunk.shape[0]} rows)")
                chunks.append(df_chunk)
                skiprows += nrows

        # No data rows at all: pd.concat([]) would raise, so return the empty
        # frame that already carries the header columns.
        if not chunks:
            return df_header

        header_columns = df_header.columns.tolist()
        # ignore_index avoids repeating 0..nrows-1 once per chunk.
        df = pd.concat(chunks, ignore_index=True)
        # Chunks were read with header=None, so their columns are positional
        # integers; map each position to the matching header name.
        df = df.rename(columns=dict(enumerate(header_columns)))
        # Keep every header column (even if no chunk had data for it), in
        # header order, followed by any data columns beyond the header.
        known = set(header_columns)
        extras = [col for col in df.columns if col not in known]
        return df.reindex(columns=header_columns + extras)
    if __name__ == '__main__':
        # Script entry point: load the workbook 1000 data rows per chunk.
        df = read_thunk(read_path, nrows=1000)