1 Reading files
read_csv
import pandas as pd

# read an ordinary tab-separated text file
df = pd.read_csv('test.tsv', sep='\t')

# skip comment lines and specify column names
df = pd.read_csv('test.tsv', sep='\t', comment='#', names=['a', 'b', 'c'])

# read selected columns (by column index)
df = pd.read_csv('test.tsv', sep='\t', usecols=[0, 1, 3])

# read selected columns (by column name)
df = pd.read_csv('test.tsv', sep='\t', usecols=['a', 'b', 'd'])

# read a compressed file; compression: {'gzip', 'bz2', 'zip', 'xz'}
df = pd.read_csv('test.tsv.gz', sep='\t', compression='gzip')

# choose the engine {'c', 'python'}: 'c' is faster, 'python' is more feature-complete
df = pd.read_csv('test.tsv.gz', sep='\t', compression='gzip', engine='c')
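A few other read_csv options come up often in practice. The sketch below shows forcing column dtypes, declaring extra missing-value strings, and taking a quick peek at only the first rows; the file name, column names, and NA markers are just placeholders:

# minimal sketch of other commonly used read_csv options;
# 'test.tsv', the column names, and the NA markers are placeholders
df = pd.read_csv(
    'test.tsv',
    sep='\t',
    dtype={'a': str, 'b': float},  # force column dtypes
    na_values=['NA', '.', '-'],    # extra strings to treat as missing
    nrows=100,                     # read only the first 100 rows
)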
Reading large files in chunks
bigfile = 'bigtest.tsv.gz'
chunksize = 10000  # number of rows to read at a time
df = pd.DataFrame()

# =================================
# Option 1: set chunksize directly (recommended)
# =================================
reader = pd.read_csv(bigfile, sep='\t', chunksize=chunksize, compression='gzip')
for chunk in reader:
    # you can do some filtering or other processing here
    df = pd.concat([df, chunk])

# ===========================
# Option 2: set iterator=True
# ===========================
reader = pd.read_csv(bigfile, sep='\t', iterator=True, compression='gzip')
while True:
    try:
        chunk = reader.get_chunk(chunksize)
        df = pd.concat([df, chunk])
    except StopIteration:
        break
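Note that calling pd.concat inside the loop re-copies everything accumulated so far on each iteration. If no per-chunk filtering is needed, a common alternative is to collect the chunks in a list and concatenate once at the end:

# variant: collect chunks first, concatenate once to avoid repeated copying
chunks = []
for chunk in pd.read_csv(bigfile, sep='\t', chunksize=chunksize, compression='gzip'):
    chunks.append(chunk)
df = pd.concat(chunks, ignore_index=True)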
Other file readers
pd.read_table  # to be deprecated; use read_csv(sep='\t') instead
pd.read_excel
pd.read_pickle
pd.read_json
pd.read_html
pd.read_sql
...
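Each reader takes its own kind of source; a few illustrative calls (the file names and URL here are hypothetical, and read_excel needs an Excel engine such as openpyxl installed):

df = pd.read_excel('test.xlsx', sheet_name=0)  # first sheet; needs openpyxl
df = pd.read_pickle('out.pkl')
df = pd.read_json('test.json')
dfs = pd.read_html('http://example.com/tables.html')  # returns a list of DataFrames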
2 Writing files
# write to a text file
df.to_csv('out.tsv', sep='\t', index=False)

# write to a compressed file
df.to_csv('out.tsv.gz', sep='\t', compression='gzip', index=False)

# write selected columns only
df.to_csv('out.tsv', sep='\t', index=False, columns=['a', 'c'])

# write to a pickle
df.to_pickle('out.pkl')

# write to SQLite3

## with sqlite3
import sqlite3
con = sqlite3.connect('test.sqlite')
df.to_sql('tablename', con, index=False)
# if_exists: {'fail', 'replace', 'append'}
df.to_sql('tablename', con, if_exists='replace')

## with sqlalchemy
from sqlalchemy import create_engine
# in-memory database
engine = create_engine('sqlite://', echo=False)
# database stored in a file
engine = create_engine('sqlite:///test.sqlite', echo=False)
df.to_sql('users', con=engine, if_exists='replace', index_label='id')
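To confirm a write succeeded, the table can be read back with pd.read_sql; a minimal sketch reusing the engine created above:

# read the table back to verify the write
df2 = pd.read_sql('users', con=engine)                # by table name (needs SQLAlchemy)
df2 = pd.read_sql('SELECT * FROM users', con=engine)  # or with an SQL query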
3 Reading files from the web
url = 'http://rest.kegg.jp/list/hsa:100750326+hsa:100422907'
df = pd.read_csv(url, sep='\t', names=['geneid', 'symbol'])
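read_csv handles the HTTP download by itself here. If you need more control over the request (timeouts, headers, retries), one option is to fetch the content with the third-party requests package and parse it from an in-memory buffer:

import io
import requests  # third-party dependency, not part of pandas

# sketch: fetch manually, then parse from memory
resp = requests.get(url, timeout=30)
resp.raise_for_status()
df = pd.read_csv(io.StringIO(resp.text), sep='\t', names=['geneid', 'symbol'])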