原文件是一个 6.8 GB 的 CSV 文件,体积太大、不便处理,因此需要按行数将其切分成多个小文件(每个小文件都保留原文件的表头行)。

    import itertools
    import os
    import time
    def mkSubFile(lines, head, srcName, sub):
        """Write one piece of the split output next to the source file.

        The output name is the source name with ``_<sub>`` inserted before
        the extension, e.g. ``data.csv`` with ``sub=3`` gives ``data_3.csv``.
        An existing file with that name is overwritten.

        Args:
            lines: Iterable of text lines to write after the header. They are
                written as-is, so each one must carry its own line terminator.
            head: Header line written first (also written as-is).
            srcName: Path of the source file; only used to derive the output
                file name.
            sub: Sequence number of this piece.

        Returns:
            ``sub + 1``, i.e. the sequence number for the next piece.
        """
        des_filename, extname = os.path.splitext(srcName)
        # Naming format: <source path without extension>_<sub><extension>
        filename = des_filename + '_' + str(sub) + extname
        print('make file: %s' % filename)
        # The context manager closes the file even if a write fails.
        with open(filename, 'w', encoding='utf-8') as fout:
            fout.write(head)
            fout.writelines(lines)
        return sub + 1
    def splitByLineCount(filename, count):
        """Split a UTF-8 text file into pieces of at most ``count`` data lines.

        The first line of ``filename`` is treated as the header and is passed
        to ``mkSubFile`` for every piece. Pieces are numbered starting at 1 and
        are produced in file order; the last piece may hold fewer than
        ``count`` lines. A file with no lines after the header produces no
        pieces.

        Args:
            filename: Path of the file to split.
            count: Maximum number of lines (not counting the header) per
                piece. Must be a positive integer.

        Raises:
            ValueError: If ``count`` is less than 1. Without this check a
                non-positive count would never trigger a flush, so the whole
                input would be buffered in memory and written as one piece.
        """
        if count < 1:
            raise ValueError('count must be a positive integer, got %r' % (count,))

        with open(filename, 'r', encoding='utf-8') as fin:
            head = fin.readline()
            sub = 1
            while True:
                # Pull the next batch of up to `count` lines from the file;
                # only one batch is held in memory at a time.
                buf = list(itertools.islice(fin, count))
                if not buf:
                    break
                sub = mkSubFile(buf, head, filename, sub)
    if __name__ == '__main__':
        # Script entry point: split the source CSV and report the elapsed
        # wall-clock time in whole seconds.
        begin = time.time()
        # 100000 is the number of data lines per piece: every 100000 lines
        # of the source file go into one output file.
        splitByLineCount('D:\\Rexel数据\\EHSY_DATA_V1.csv', 100000)
        end = time.time()
        print('time is %d seconds ' % (end - begin))

    原文件:
    image.png

    切分后的文件:
    image.png