Students_Duplicates.xlsx

1.数据的去重

  1. import pandas as pd
  2. students = pd.read_excel('tmp1\Students_Duplicates.xlsx')
  3. students.drop_duplicates(subset='Name', inplace=True, keep='first')
  4. # 进行数据的去重 keep='first'默认保留一组重复数据的前面几个,last保留后几个
  5. print(students)
  6. """
  7. ID Name Test_1 Test_2 Test_3
  8. 0 1 Student_001 62 86 83
  9. 1 2 Student_002 77 97 78
  10. 2 3 Student_003 57 96 46
  11. 3 4 Student_004 57 87 80
  12. 4 5 Student_005 95 59 87
  13. 5 6 Student_006 56 97 61
  14. 6 7 Student_007 64 91 67
  15. 7 8 Student_008 96 70 48
  16. 8 9 Student_009 77 73 48
  17. 9 10 Student_010 90 94 67
  18. 10 11 Student_011 62 55 63
  19. 11 12 Student_012 83 76 81
  20. 12 13 Student_013 68 60 90
  21. 13 14 Student_014 82 68 98
  22. 14 15 Student_015 61 67 91
  23. 15 16 Student_016 59 63 46
  24. 16 17 Student_017 62 83 93
  25. 17 18 Student_018 90 75 80
  26. 18 19 Student_019 100 95 55
  27. 19 20 Student_020 61 87 100
  28. """

2.拿到重复数据

  1. import pandas as pd
  2. students = pd.read_excel('tmp1\Students_Duplicates.xlsx')
  3. dupe = students.duplicated(subset='Name')
  4. # print(dupe.any()) # True说明有重复数值
  5. dupe1 = dupe[dupe] # [dupe == True]
  6. print(students.iloc[dupe1.index]) # iloc根据index定位数据
  7. """
  8. ID Name Test_1 Test_2 Test_3
  9. 20 21 Student_001 62 86 83
  10. 21 22 Student_002 77 97 78
  11. 22 23 Student_003 57 96 46
  12. 23 24 Student_004 57 87 80
  13. 24 25 Student_005 95 59 87
  14. """