# Students_Duplicates.xlsx
# 1. 数据的去重
import pandas as pd
# Load the student score sheet. A raw string is used so the Windows-style
# backslash in the path is not parsed as an (invalid) escape sequence.
students = pd.read_excel(r'tmp1\Students_Duplicates.xlsx')
# Drop every row whose 'Name' repeats an earlier row. keep='first' (the
# default) retains only the first occurrence of each name; keep='last' would
# retain only the last occurrence instead.
students = students.drop_duplicates(subset='Name', keep='first')
print(students)
"""
ID Name Test_1 Test_2 Test_3
0 1 Student_001 62 86 83
1 2 Student_002 77 97 78
2 3 Student_003 57 96 46
3 4 Student_004 57 87 80
4 5 Student_005 95 59 87
5 6 Student_006 56 97 61
6 7 Student_007 64 91 67
7 8 Student_008 96 70 48
8 9 Student_009 77 73 48
9 10 Student_010 90 94 67
10 11 Student_011 62 55 63
11 12 Student_012 83 76 81
12 13 Student_013 68 60 90
13 14 Student_014 82 68 98
14 15 Student_015 61 67 91
15 16 Student_016 59 63 46
16 17 Student_017 62 83 93
17 18 Student_018 90 75 80
18 19 Student_019 100 95 55
19 20 Student_020 61 87 100
"""
# 2. 拿到重复数据
import pandas as pd
# Load the student score sheet. A raw string is used so the Windows-style
# backslash in the path is not parsed as an (invalid) escape sequence.
students = pd.read_excel(r'tmp1\Students_Duplicates.xlsx')
# Boolean Series aligned with `students`: True for each row whose 'Name'
# already appeared in an earlier row (the first occurrence stays False).
dupe = students.duplicated(subset='Name')
# print(dupe.any())  # True means at least one duplicated name exists
# Select the flagged rows with the boolean mask itself. This does not rely on
# the index labels happening to equal row positions, which feeding
# `dupe[dupe].index` into the positional `.iloc` indexer would require.
print(students[dupe])
"""
ID Name Test_1 Test_2 Test_3
20 21 Student_001 62 86 83
21 22 Student_002 77 97 78
22 23 Student_003 57 96 46
23 24 Student_004 57 87 80
24 25 Student_005 95 59 87
"""