import os
import time
import numpy as np
import pandas as pd
# Benchmark payload: ten million rows by four columns of samples drawn from
# the standard normal distribution, wrapped in a DataFrame.
data = np.random.standard_normal(size=(10_000_000, 4))
df = pd.DataFrame(data=data)
# Names of the compression settings to benchmark, in the order they are run.
# 'default' is handled specially by the benchmark loop; every other entry is
# passed through as the complib value.
methods = ["default", "zlib", "lzo", "bzip2", "blosc"]
# Blosc again, once for each explicitly selected internal codec.
methods += [
    f"blosc:{codec}"
    for codec in ("blosclz", "lz4", "lz4hc", "snappy", "zlib", "zstd")
]
# Write the frame once per compression method and print, tab-separated, the
# method name, the resulting file size in bytes and the elapsed seconds.
# The output directory is created up front because to_hdf does not create
# missing parent directories.
os.makedirs("./data", exist_ok=True)
for method in methods:
    if method == 'default':
        # Baseline run: leave both compression arguments unset.
        level, comp = None, None
    else:
        # Every named library is run at the maximum compression level.
        level, comp = 9, method
    # ':' is swapped for '-' to keep the method name usable as a file name.
    file = f"./data/{method.replace(':', '-')}.h5"
    # Time only the write itself, using a monotonic high-resolution clock.
    start_time = time.perf_counter()
    # mode='w' starts from an empty file so that leftovers from an earlier run
    # (the default mode='a' reopens existing files) cannot distort the size.
    df.to_hdf(file, key='df', mode='w', complevel=level, complib=comp)
    elapsed = time.perf_counter() - start_time
    print(method, os.path.getsize(file), elapsed, sep='\t')
性能比较（size 单位：字节，time 单位：秒）:
method          size            time
-------------------------------------------------
default         400007240       2.722801923751831
zlib            283500043       166.24143624305725
lzo             305977502       0.4248645305633545
bzip2           291635531       35.23882055282593
blosc           309459970       1.6485953330993652
blosc:blosclz   309459994       1.662555456161499
blosc:lz4       305701404       0.4089069366455078
blosc:lz4hc     293242558       16.87893295288086
blosc:snappy    324139336       0.3809819221496582
blosc:zlib      281797972       164.94474864006042
blosc:zstd      287126493       42.24786901473999