1 读取文件

read_csv

  1. import pandas as pd
  2. # 读取普通tab分隔的文本
  3. df = pd.read_csv('test.tsv', sep='\t')
  4. # 跳过注释行,指定列名
  5. df = pd.read_csv('test.tsv', sep='\t', comment='#', names=['a', 'b', 'c'])
  6. # 读取指定列(按列的下标)
  7. df = pd.read_csv('test.tsv', sep='\t', usecols=[0, 1, 3])
  8. # 读取指定列(按列名)
  9. df = pd.read_csv('test.tsv', sep='\t', usecols=['a', 'b', 'd'])
  10. # 读取压缩文件 {'gzip', 'bz2', 'zip', 'xz'}
  11. df = pd.read_csv('test.tsv.gz', sep='\t', compression='gzip')
  12. # 指定engine {'c', 'python'} c更快,python更完善
  13. df = pd.read_csv('test.tsv.gz', sep='\t', compression='gzip', engine='c')

大文件分块读取

  1. bigfile = 'bigtest.tsv.gz'
  2. chunksize = 10000 # 每次读取的行数
  3. df = pd.DataFrame()
  4. # =================================
  5. # 方式1:直接设定chunksize (推荐)
  6. # =================================
  7. reader = pd.read_csv(bigfile, sep='\t', chunksize=chunksize, compression='gzip')
  8. for chunk in reader:
  9.     # you can do some filtration or something else here
  10.     df = pd.concat([df, chunk])
  11. # ===========================
  12. # 方式2:设置iterator=True
  13. # ===========================
  14. reader = pd.read_csv(bigfile, sep='\t', iterator=True, compression='gzip')
  15. while True:
  16.     try:
  17.         chunk = reader.get_chunk(chunksize)
  18.         df = pd.concat([df, chunk])
  19.     except StopIteration:
  20.         break

其他文件读取

  1. pd.read_table # 将废弃,用read_csv(sep='\t')代替
  2. pd.read_excel
  3. pd.read_pickle
  4. pd.read_json
  5. pd.read_html
  6. pd.read_sql
  7. ...

2 写入文件

  1. # 写入文本文件
  2. df.to_csv('out.tsv', sep='\t', index=False)
  3. # 压缩文件
  4. df.to_csv('out.tsv.gz', sep='\t', compression='gzip', index=False)
  5. # 写指定列
  6. df.to_csv('out.tsv', sep='\t', index=False, columns=['a', 'c'])
  7. # 写入pickle
  8. df.to_pickle('out.pkl')
  9. # 写入SQLite3
  10. ## with sqlite3
  11. import sqlite3
  12. con = sqlite3.connect('test.sqlite')
  13. df.to_sql('tablename', con, index=False)
  14. # if_exists: {'fail', 'replace', 'append'}
  15. df.to_sql('tablename', con, if_exists='replace')
  16. ## with sqlalchemy
  17. from sqlalchemy import create_engine
  18. # in-memory
  19. engine = create_engine('sqlite://', echo=False)
  20. # to a file
  21. engine = create_engine('sqlite:///test.sqlite', echo=False)
  22. df.to_sql('users', con=engine, if_exists='replace', index_label='id')

3 读取网络文件

  1. url = 'http://rest.kegg.jp/list/hsa:100750326+hsa:100422907'
  2. df = pd.read_csv(url, sep='\t', names=['geneid', 'symbol'])