1 Reading files
read_csv
import pandas as pd

# read an ordinary tab-separated text file
df = pd.read_csv('test.tsv', sep='\t')

# skip comment lines and specify column names
df = pd.read_csv('test.tsv', sep='\t', comment='#', names=['a', 'b', 'c'])

# read selected columns (by column index)
df = pd.read_csv('test.tsv', sep='\t', usecols=[0, 1, 3])

# read selected columns (by column name)
df = pd.read_csv('test.tsv', sep='\t', usecols=['a', 'b', 'd'])

# read a compressed file; compression: {'gzip', 'bz2', 'zip', 'xz'}
df = pd.read_csv('test.tsv.gz', sep='\t', compression='gzip')

# choose the engine {'c', 'python'}: 'c' is faster, 'python' is more feature-complete
df = pd.read_csv('test.tsv.gz', sep='\t', compression='gzip', engine='c')
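A few other read_csv options come up often in practice. The sketch below shows forcing column dtypes, declaring extra missing-value strings, and taking a quick peek at only the first rows; the file name, column names, and NA markers are just placeholders:

# minimal sketch of other commonly used read_csv options;
# 'test.tsv', the column names, and the NA markers are placeholders
df = pd.read_csv(
    'test.tsv',
    sep='\t',
    dtype={'a': str, 'b': float},  # force column dtypes
    na_values=['NA', '.', '-'],    # extra strings to treat as missing
    nrows=100,                     # read only the first 100 rows
)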
Reading large files in chunks
bigfile = 'bigtest.tsv.gz'
chunksize = 10000  # number of rows to read at a time
df = pd.DataFrame()

# =================================
# Option 1: set chunksize directly (recommended)
# =================================
reader = pd.read_csv(bigfile, sep='\t', chunksize=chunksize, compression='gzip')
for chunk in reader:
    # you can do some filtering or other processing here
    df = pd.concat([df, chunk])

# ===========================
# Option 2: set iterator=True
# ===========================
reader = pd.read_csv(bigfile, sep='\t', iterator=True, compression='gzip')
while True:
    try:
        chunk = reader.get_chunk(chunksize)
        df = pd.concat([df, chunk])
    except StopIteration:
        break
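Note that calling pd.concat inside the loop re-copies everything accumulated so far on each iteration. If no per-chunk filtering is needed, a common alternative is to collect the chunks in a list and concatenate once at the end:

# variant: collect chunks first, concatenate once to avoid repeated copying
chunks = []
for chunk in pd.read_csv(bigfile, sep='\t', chunksize=chunksize, compression='gzip'):
    chunks.append(chunk)
df = pd.concat(chunks, ignore_index=True)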
Other file readers
pd.read_table  # to be deprecated; use read_csv(sep='\t') instead
pd.read_excel
pd.read_pickle
pd.read_json
pd.read_html
pd.read_sql
...
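Each reader takes its own kind of source; a few illustrative calls (the file names and URL here are hypothetical, and read_excel needs an Excel engine such as openpyxl installed):

df = pd.read_excel('test.xlsx', sheet_name=0)  # first sheet; needs openpyxl
df = pd.read_pickle('out.pkl')
df = pd.read_json('test.json')
dfs = pd.read_html('http://example.com/tables.html')  # returns a list of DataFrames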
2 Writing files
# write to a text file
df.to_csv('out.tsv', sep='\t', index=False)

# write to a compressed file
df.to_csv('out.tsv.gz', sep='\t', compression='gzip', index=False)

# write selected columns only
df.to_csv('out.tsv', sep='\t', index=False, columns=['a', 'c'])

# write to a pickle
df.to_pickle('out.pkl')

# write to SQLite3

## with sqlite3
import sqlite3
con = sqlite3.connect('test.sqlite')
df.to_sql('tablename', con, index=False)
# if_exists: {'fail', 'replace', 'append'}
df.to_sql('tablename', con, if_exists='replace')

## with sqlalchemy
from sqlalchemy import create_engine
# in-memory database
engine = create_engine('sqlite://', echo=False)
# database stored in a file
engine = create_engine('sqlite:///test.sqlite', echo=False)
df.to_sql('users', con=engine, if_exists='replace', index_label='id')
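To confirm a write succeeded, the table can be read back with pd.read_sql; a minimal sketch reusing the engine created above:

# read the table back to verify the write
df2 = pd.read_sql('users', con=engine)                # by table name (needs SQLAlchemy)
df2 = pd.read_sql('SELECT * FROM users', con=engine)  # or with an SQL query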
3 Reading files from the web
url = 'http://rest.kegg.jp/list/hsa:100750326+hsa:100422907'
df = pd.read_csv(url, sep='\t', names=['geneid', 'symbol'])
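read_csv handles the HTTP download by itself here. If you need more control over the request (timeouts, headers, retries), one option is to fetch the content with the third-party requests package and parse it from an in-memory buffer:

import io
import requests  # third-party dependency, not part of pandas

# sketch: fetch manually, then parse from memory
resp = requests.get(url, timeout=30)
resp.raise_for_status()
df = pd.read_csv(io.StringIO(resp.text), sep='\t', names=['geneid', 'symbol'])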