筛选唯一值或不唯一的值
def df_concat_col(df, cols: list, joiner="", new_col="拼接"): """dataframe拼接多列 :param df: 原dataframe :param cols: 拼接的列 :param joiner: 连接符,默认空 :param new_col: 拼接生成的新列名 """ df[new_col] = df[cols[0]] for i in range(1, len(cols)): df[new_col] = df[new_col].map(str) + joiner + df[cols[i]].map(str) return dfdef df_filter_notunique(df, group_cols: list, unique=False): """筛选出分组列中不唯一的数据 :param df: 原始dataframe :param group_cols: 分组筛选的列,可多列 :param unique: 是否筛选出唯一值,否则筛选不唯一的值 :return: 筛选后的dataframe """ # 多列拼接 tmpdf = df_concat_col(df, cols=group_cols, joiner="-", new_col="拼接") # print(df) df_group = tmpdf[group_cols+["拼接"]].groupby("拼接").agg(['count'])[group_cols[0]] df_group = df_group[df_group['count'] > 1] duplicate = list(df_group.index) # 返回出现次数大于1的柜号 res = tmpdf[~tmpdf["拼接"].isin(duplicate)] if unique else tmpdf[tmpdf["拼接"].isin(duplicate)] res = res.drop('拼接', axis=1) # 删除列 return res
if __name__ == '__main__': import pandas as pd # 创建示例dataframe df = pd.DataFrame([["A", "a", 4], ["A", "b", 7], ["B", "a", 2], ["B", "b", 2], ["B", "a", 9]], columns=["列1", "列2", "列3"]) print(df_filter_notunique(df, group_cols=["列1", "列3"]))