# Large dataset: read the Excel file in chunks.
import pandas as pd
# Path of the Excel workbook that is read when this file is run as a script.
read_path = r'D:\06_items\01_数据处理\dispose_data\aaa.xlsx'
def read_thunk(read_path, nrows):
    """Read the first worksheet of an Excel workbook in fixed-size chunks.

    The header row is parsed once to obtain the column names. The data rows
    are then read ``nrows`` at a time and stitched together into a single
    DataFrame. Progress is reported on stdout.

    Args:
        read_path: Path of the Excel workbook to read.
        nrows: Maximum number of data rows to read per chunk; must be >= 1.

    Returns:
        A DataFrame containing every data row of the first worksheet exactly
        once, labelled with the worksheet's header row and indexed 0..n-1.
        A worksheet without data rows yields an empty DataFrame that still
        carries the header columns.

    Raises:
        ValueError: If ``nrows`` is smaller than 1.
    """
    if nrows < 1:
        raise ValueError(f"nrows must be a positive integer, got {nrows!r}")

    # Open the workbook once and reuse the handle for every read; the context
    # manager closes the underlying file when we are done.
    with pd.ExcelFile(read_path) as xl:
        print('xl--->', xl)
        sheet_names = xl.sheet_names
        print(sheet_names)
        sheet = sheet_names[0]

        # nrows=0 parses only the header row. Reading one row here would pull
        # in the first data row as well, which the chunk loop reads again.
        header_cols = pd.read_excel(
            xl, sheet_name=sheet, nrows=0).columns.tolist()
        print(f"Excel file: {read_path} (worksheet: {sheet})")

        chunks = []
        skiprows = 1  # the header row has already been consumed
        while True:
            df_chunk = pd.read_excel(
                xl, sheet_name=sheet,
                nrows=nrows, skiprows=skiprows, header=None)
            skiprows += nrows
            # A chunk with zero rows means the end of the sheet was reached.
            if not df_chunk.shape[0]:
                break
            print(f" - chunk {len(chunks)} ({df_chunk.shape[0]} rows)")
            chunks.append(df_chunk)

    if not chunks:
        # Header-only (or empty) sheet: pd.concat([]) would raise.
        return pd.DataFrame(columns=header_cols)

    # Chunks were read with header=None, so their columns are positional
    # integers; map every position to the matching header name.
    columns = dict(enumerate(header_cols))
    df = pd.concat(chunks, ignore_index=True)
    return df.rename(columns=columns)
if __name__ == '__main__':
    # Script entry point: load the configured workbook, 1000 rows per read.
    df = read_thunk(read_path, nrows=1000)