筛选唯一值或不唯一的值

  1. def df_concat_col(df, cols: list, joiner="", new_col="拼接"):
  2. """
  3. dataframe拼接多列
  4. :param df: 原dataframe
  5. :param cols: 拼接的列
  6. :param joiner: 连接符,默认空
  7. :param new_col: 拼接生成的新列名
  8. """
  9. df[new_col] = df[cols[0]]
  10. for i in range(1, len(cols)):
  11. df[new_col] = df[new_col].map(str) + joiner + df[cols[i]].map(str)
  12. return df
  13. def df_filter_notunique(df, group_cols: list, unique=False):
  14. """
  15. 筛选出分组列中不唯一的数据
  16. :param df: 原始dataframe
  17. :param group_cols: 分组筛选的列,可多列
  18. :param unique: 是否筛选出唯一值,否则筛选不唯一的值
  19. :return: 筛选后的dataframe
  20. """
  21. # 多列拼接
  22. tmpdf = df_concat_col(df, cols=group_cols, joiner="-", new_col="拼接")
  23. # print(df)
  24. df_group = tmpdf[group_cols+["拼接"]].groupby("拼接").agg(['count'])[group_cols[0]]
  25. df_group = df_group[df_group['count'] > 1]
  26. duplicate = list(df_group.index) # 返回出现次数大于1的柜号
  27. res = tmpdf[~tmpdf["拼接"].isin(duplicate)] if unique else tmpdf[tmpdf["拼接"].isin(duplicate)]
  28. res = res.drop('拼接', axis=1) # 删除列
  29. return res
  1. if __name__ == '__main__':
  2. import pandas as pd
  3. # 创建示例dataframe
  4. df = pd.DataFrame([["A", "a", 4], ["A", "b", 7], ["B", "a", 2], ["B", "b", 2], ["B", "a", 9]],
  5. columns=["列1", "列2", "列3"])
  6. print(df_filter_notunique(df, group_cols=["列1", "列3"]))