# Goal: select every row of df_test whose `col1` value occurs more than once.
df_test = pd.DataFrame({'col1': ['A']*2+['B'], 'col2': range(3)})
df_test.col1.duplicated()  # default keep='first': the first occurrence is NOT flagged as a duplicate
df_test.query("col1.duplicated()")  # hence this only selects the repeated rows after the first occurrence

# Method 1: boolean mask straight from DataFrame.duplicated.
df_test.duplicated(subset=['col1'], keep=False)  # keep=False flags every row of a duplicated value
df_test[df_test.duplicated(subset=['col1'], keep=False)]

# Method 2: collect the values that repeat (inner query), then keep all rows
# whose col1 is among those values (outer query).
df_test.query('''col1 in @df_test.query("col1.duplicated()").col1''')

# Method 3: count occurrences per value; the values with count > 1 are the
# index labels of the True entries of ser_bool.
ser_bool = (df_test.col1.value_counts() > 1)
ser_bool[ser_bool]
df_test.query('''col1 in @ser_bool[@ser_bool].index''')

# Method 4: join the per-value "is duplicated" flag back onto the rows.
# value_counts() labels its result differently across pandas versions
# (before 2.0: unnamed index, Series named after the column; from 2.0 on:
# index named after the column, Series named 'count'), so the two column
# names are pinned explicitly: 'index' holds the value, 'col1' holds the flag.
df_bool = (df_test.col1.value_counts() > 1).rename_axis('index').reset_index(name='col1')
# After the merge the flag column collides with df_test's col1 and receives the
# '_y' suffix. Note that merging on columns discards df_test's original index.
df_test.merge(df_bool, left_on='col1', right_on='index', suffixes=('', '_y')).query("col1_y")[df_test.columns]
pandas.DataFrame.duplicated
keep : {'first', 'last', False}, default 'first'
Determines which duplicates (if any) to mark.
- 'first' : Mark duplicates as True except for the first occurrence.
- 'last' : Mark duplicates as True except for the last occurrence.
- False : Mark all duplicates as True.