筛选唯一值或不唯一的值
def df_concat_col(df, cols: list, joiner="", new_col="拼接"):
"""
dataframe拼接多列
:param df: 原dataframe
:param cols: 拼接的列
:param joiner: 连接符,默认空
:param new_col: 拼接生成的新列名
"""
df[new_col] = df[cols[0]]
for i in range(1, len(cols)):
df[new_col] = df[new_col].map(str) + joiner + df[cols[i]].map(str)
return df
def df_filter_notunique(df, group_cols: list, unique=False):
"""
筛选出分组列中不唯一的数据
:param df: 原始dataframe
:param group_cols: 分组筛选的列,可多列
:param unique: 是否筛选出唯一值,否则筛选不唯一的值
:return: 筛选后的dataframe
"""
# 多列拼接
tmpdf = df_concat_col(df, cols=group_cols, joiner="-", new_col="拼接")
# print(df)
df_group = tmpdf[group_cols+["拼接"]].groupby("拼接").agg(['count'])[group_cols[0]]
df_group = df_group[df_group['count'] > 1]
duplicate = list(df_group.index) # 返回出现次数大于1的柜号
res = tmpdf[~tmpdf["拼接"].isin(duplicate)] if unique else tmpdf[tmpdf["拼接"].isin(duplicate)]
res = res.drop('拼接', axis=1) # 删除列
return res
if __name__ == '__main__':
import pandas as pd
# 创建示例dataframe
df = pd.DataFrame([["A", "a", 4], ["A", "b", 7], ["B", "a", 2], ["B", "b", 2], ["B", "a", 9]],
columns=["列1", "列2", "列3"])
print(df_filter_notunique(df, group_cols=["列1", "列3"]))