原文件是一个 6.8 GB 的 CSV 文件，体积太大，不便直接处理，因此需要按行数将其切分成多个小文件，每个小文件都保留原文件的表头行。
import os
import time
from itertools import islice


def mkSubFile(lines, head, srcName, sub):
    """Write one piece of the split file and return the next piece index.

    The output path is derived from ``srcName`` by inserting ``_<sub>``
    before the extension (``data.csv`` -> ``data_1.csv``), so the piece is
    created next to the source file.

    Args:
        lines: Iterable of data lines (each including its line terminator).
        head: Header line written first in every piece.
        srcName: Path of the source file; used only to build the output name.
        sub: 1-based index of the piece being written.

    Returns:
        ``sub + 1``, the index to use for the next piece.
    """
    des_filename, extname = os.path.splitext(srcName)
    # Naming format: <source stem>_<index><source extension>
    filename = des_filename + '_' + str(sub) + extname
    print('make file: %s' % filename)
    # newline='' disables newline translation so the terminators already
    # present in `head` and `lines` are written unchanged.
    with open(filename, 'w', encoding='utf-8', newline='') as fout:
        fout.write(head)
        fout.writelines(lines)
    return sub + 1


def splitByLineCount(filename, count):
    """Split a UTF-8 text/CSV file into pieces of at most ``count`` data lines.

    The first line of the source is treated as a header and repeated at the
    top of every piece. Pieces are written by ``mkSubFile`` and numbered
    starting from 1. A source with no data lines produces no output files.

    NOTE: the split is purely line-based; a CSV record containing a quoted
    embedded newline may end up divided across two pieces.

    Args:
        filename: Path of the source file.
        count: Maximum number of data lines per piece; must be >= 1.

    Raises:
        ValueError: If ``count`` is less than 1. Without this check the
            chunk boundary would never be reached and the whole source
            would be buffered in memory.
    """
    if count < 1:
        raise ValueError('count must be a positive integer, got %r' % (count,))

    # newline='' still splits on any line terminator but returns it
    # untranslated, so the pieces keep the source's original line endings.
    with open(filename, 'r', encoding='utf-8', newline='') as fin:
        head = fin.readline()
        sub = 1
        while True:
            # Pull at most `count` lines; only one chunk is held in memory.
            chunk = list(islice(fin, count))
            if not chunk:
                break
            sub = mkSubFile(chunk, head, filename, sub)


if __name__ == '__main__':
    begin = time.time()
    # 100000 is the number of data lines per piece: every 100000 lines
    # are written to a separate file.
    splitByLineCount('D:\\Rexel数据\\EHSY_DATA_V1.csv', 100000)
    end = time.time()
    print('time is %d seconds ' % (end - begin))
原文件:
切分后的文件:
