1 Reading files
read_csv
import pandas as pd
# Read a plain tab-separated text file
df = pd.read_csv('test.tsv', sep='\t')
# Skip comment lines and specify column names
df = pd.read_csv('test.tsv', sep='\t', comment='#', names=['a', 'b', 'c'])
# Read specific columns (by column index)
df = pd.read_csv('test.tsv', sep='\t', usecols=[0, 1, 3])
# Read specific columns (by column name)
df = pd.read_csv('test.tsv', sep='\t', usecols=['a', 'b', 'd'])
# Read a compressed file; compression can be {'gzip', 'bz2', 'zip', 'xz'} (the default 'infer' detects it from the extension)
df = pd.read_csv('test.tsv.gz', sep='\t', compression='gzip')
# Specify the parser engine {'c', 'python'}: 'c' is faster, 'python' is more feature-complete
df = pd.read_csv('test.tsv.gz', sep='\t', compression='gzip', engine='c')
Reading large files in chunks
bigfile = 'bigtest.tsv.gz'
chunksize = 10000  # number of rows to read per chunk
df = pd.DataFrame()
# ==============================================
# Option 1: set chunksize directly (recommended)
# ==============================================
reader = pd.read_csv(bigfile, sep='\t', chunksize=chunksize, compression='gzip')
for chunk in reader:
    # filter or otherwise process each chunk here
    df = pd.concat([df, chunk])
# ============================
# Option 2: set iterator=True
# ============================
reader = pd.read_csv(bigfile, sep='\t', iterator=True, compression='gzip')
while True:
    try:
        chunk = reader.get_chunk(chunksize)
        df = pd.concat([df, chunk])
    except StopIteration:
        break
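Both patterns above call pd.concat inside the loop, which copies the accumulated DataFrame on every iteration. When there are many chunks, it is usually faster to collect them in a list and concatenate once at the end; a minimal sketch using the same bigfile and chunksize:
# Faster: collect chunks in a list, then concatenate once at the end
chunks = []
for chunk in pd.read_csv(bigfile, sep='\t', chunksize=chunksize, compression='gzip'):
    chunks.append(chunk)
df = pd.concat(chunks, ignore_index=True)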
Reading other file formats
pd.read_table  # to be deprecated; use read_csv(sep='\t') instead
pd.read_excel
pd.read_pickle
pd.read_json
pd.read_html
pd.read_sql
...
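As a quick reference, a minimal sketch of calling a few of these readers; the file names are hypothetical placeholders:
df = pd.read_excel('test.xlsx', sheet_name=0)  # first sheet of an Excel workbook (hypothetical file)
df = pd.read_pickle('test.pkl')                # a DataFrame saved with to_pickle (hypothetical file)
df = pd.read_json('test.json')                 # a JSON file (hypothetical file)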
2 Writing files
# Write to a text file
df.to_csv('out.tsv', sep='\t', index=False)
# Write a compressed file
df.to_csv('out.tsv.gz', sep='\t', compression='gzip', index=False)
# Write only the specified columns
df.to_csv('out.tsv', sep='\t', index=False, columns=['a', 'c'])
# Write to a pickle file
df.to_pickle('out.pkl')
# Write to SQLite3
## with sqlite3
import sqlite3
con = sqlite3.connect('test.sqlite')  # connect to (or create) the database file
df.to_sql('tablename', con, index=False)
# if_exists: {'fail', 'replace', 'append'}
df.to_sql('tablename', con, if_exists='replace')
## with sqlalchemy
from sqlalchemy import create_engine
# in-memory
engine = create_engine('sqlite://', echo=False)
# to a file
engine = create_engine('sqlite:///test.sqlite', echo=False)
df.to_sql('users', con=engine, if_exists='replace', index_label='id')
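To verify a write, the table can be read back with pd.read_sql. A minimal sketch, assuming the tablename and users tables created above:
# Read back via a raw SQL query (works with the plain sqlite3 connection)
df1 = pd.read_sql('SELECT * FROM tablename', con)
# Read back a whole table by name (requires the SQLAlchemy engine)
df2 = pd.read_sql('users', con=engine)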
3 Reading files over the network
url = 'http://rest.kegg.jp/list/hsa:100750326+hsa:100422907'
df = pd.read_csv(url, sep='\t', names=['geneid', 'symbol'])